Answered
Mixing regular and italic text in a paragraph doesn't detect the font type properly
Expected Behaviour
Italic fonts are properly detected when they are mixed with regular fonts.
Actual Behaviour
When text mixes regular and italic fonts, the italic fonts are not properly detected.
Reproduce Scenario (including but not limited to)
A PDF document that mixes regular and italics fonts
Steps to Reproduce
- Generate a PDF document
sample.pdf - Upload to platform
- Retrieve the JSON file
{
"version": {
"json_export": "191",
"page_segmentation": "5",
"schema": "1.1.0",
"structure": "1.1056.0",
"table_structure": "5"
},
"extended_metadata": {
"ID_instance": "99 96 BE A8 0B B8 B2 11 0A 00 67 45 8B 6B C6 23 ",
"ID_permanent": "30 37 20 36 33 20 42 44 20 41 38 20 30 42 20 42 38 20 42 32 20 31 31 20 30 41 20 30 30 20 36 37 20 34 35 20 38 42 20 36 42 20 43 36 20 32 33 20 ",
"pdf_version": "1.6",
"pdfa_compliance_level": "",
"is_encrypted": false,
"has_acroform": false,
"is_digitally_signed": false,
"pdfua_compliance_level": "",
"page_count": 1,
"has_embedded_files": false,
"is_certified": false,
"is_XFA": false,
"language": "es-ES"
},
"elements": [
{
"Bounds": [
85.10400390625,
757.9199981689453,
389.3470916748047,
770.5939178466797],
"ClipBounds": [
85.10400390625,
757.9199981689453,
389.3470916748047,
770.5939178466797],
"Font": {
"alt_family_name": "Calibri",
"embedded": true,
"encoding": "WinAnsiEncoding",
"family_name": "Calibri",
"font_type": "TrueType",
"italic": false,
"monospaced": false,
"name": "BCDEEE+Calibri",
"subset": true,
"weight": 400
},
"HasClip": true,
"Lang": "en",
"Page": 0,
"Path": "//Document/Sect/P",
"Text": "This text mixes regulard and italics and doesn\u2019t seem to be working. ",
"TextSize": 11.039993286132812,
"attributes": {
"LineHeight": 13.25,
"SpaceAfter": 9.375
}
},
{
"Bounds": [
85.10400390625,
735.3399963378906,
389.3470916748047,
748.013916015625],
"ClipBounds": [
85.10400390625,
735.3399963378906,
389.3470916748047,
748.013916015625],
"Font": {
"alt_family_name": "Calibri",
"embedded": true,
"encoding": "WinAnsiEncoding",
"family_name": "Calibri",
"font_type": "TrueType",
"italic": false,
"monospaced": false,
"name": "BCDEEE+Calibri",
"subset": true,
"weight": 400
},
"HasClip": true,
"Lang": "en",
"Page": 0,
"Path": "//Document/Sect/P[2]",
"Text": "This text mixes regulard and italics and doesn\u2019t seem to be working. ",
"TextSize": 11.039993286132812,
"attributes": {
"LineHeight": 13.25,
"SpaceAfter": 9.25
}
},
{
"Bounds": [
87.62399291992188,
712.8999938964844,
391.86708068847656,
725.5739135742188],
"ClipBounds": [
87.62399291992188,
712.8999938964844,
391.86708068847656,
725.5739135742188],
"Font": {
"alt_family_name": "Calibri",
"embedded": true,
"encoding": "WinAnsiEncoding",
"family_name": "Calibri",
"font_type": "TrueType",
"italic": false,
"monospaced": false,
"name": "BCDEEE+Calibri",
"subset": true,
"weight": 400
},
"HasClip": true,
"Lang": "en",
"Page": 0,
"Path": "//Document/Sect/P[3]",
"Text": "This text mixes regulard and italics and doesn\u2019t seem to be working. ",
"TextSize": 11.039993286132812,
"attributes": {
"LineHeight": 13.25
}
}],
"pages": [
{
"boxes": {
"CropBox": [
0.0,
0.0,
595.3200073242188,
841.9199829101562],
"MediaBox": [
0.0,
0.0,
595.3200073242188,
841.9199829101562]
},
"height": 841.9199829101562,
"is_scanned": false,
"page_number": 0,
"rotation": 0,
"width": 595.3200073242188
}]
}
Platform and Version
Sample Code that illustrates the problem
/**
 * Runs the Extract PDF operation on a local PDF file and saves the result.
 *
 * Text and table elements are requested, together with renditions for
 * figures and tables. Any error thrown while executing the operation or
 * saving its result is written to the console and NOT rethrown, so the
 * returned promise resolves in both the success and the failure case.
 *
 * Relies on `PDFServicesSdk`, `executionContext` and `logger` being available
 * in the enclosing scope.
 *
 * @param {string} filepath - Local path of the PDF handed to the operation.
 * @param {string} outputpath - Path the operation result is saved to
 *   (presumably a ZIP archive, judging by the function name).
 * @returns {Promise<void>}
 */
const extractPDF2Zip_v2 = async (filepath, outputpath) => {
  const { ExtractPDF, FileRef } = PDFServicesSdk;
  const {
    ExtractPdfOptions,
    ExtractElementType,
    ExtractRenditionsElementType,
  } = ExtractPDF.options;

  // What to extract, and which element kinds should also get renditions.
  const elementTypes = [ExtractElementType.TEXT, ExtractElementType.TABLES];
  const renditionTypes = [
    ExtractRenditionsElementType.FIGURES,
    ExtractRenditionsElementType.TABLES,
  ];

  const extractOptions = new ExtractPdfOptions.Builder()
    .addElementsToExtract(...elementTypes)
    .addElementsToExtractRenditions(...renditionTypes)
    .build();

  // Wire the operation: source file first, then the extraction options.
  const operation = ExtractPDF.Operation.createNew();
  const sourceFile = FileRef.createFromLocalFile(
    filepath,
    ExtractPDF.SupportedSourceFormat.pdf,
  );
  operation.setInput(sourceFile);
  operation.setOptions(extractOptions);

  const separator = `----------------------------------`;

  try {
    const extraction = await operation.execute(executionContext);
    logger.info(separator);
    logger.info(`PARTIAL: Save file '${outputpath}' ...`);
    logger.info(separator);
    await extraction.saveAsFile(outputpath);
  } catch (err) {
    // SDK-originated failures and everything else only differ in the label.
    const { ServiceApiError, ServiceUsageError } = PDFServicesSdk.Error;
    const isSdkError =
      err instanceof ServiceApiError || err instanceof ServiceUsageError;
    const message = isSdkError
      ? "Exception encountered while executing operation1"
      : "Exception encountered while executing operation2";
    console.log(message, err);
  }
};