Skip to main content
Participant
September 26, 2023
Answered

Mixing regular and italic text in a paragraph doesn't detect the font type properly

  • September 26, 2023
  • 1 reply
  • 949 views

Expected Behaviour

Italic fonts are properly detected when they are mixed with regular fonts.

Actual Behaviour

When text mixes regular and italic fonts, the italic fonts are not properly detected.

Reproduce Scenario (including but not limited to)

A PDF document that mixes regular and italics fonts

Steps to Reproduce

  1. Generate a PDF document
    sample.pdf
  2. Upload to platform
  3. Retrieve the JSON file
{
  "version": {
    "json_export": "191",
    "page_segmentation": "5",
    "schema": "1.1.0",
    "structure": "1.1056.0",
    "table_structure": "5"
  },
  "extended_metadata": {
    "ID_instance": "99 96 BE A8 0B B8 B2 11 0A 00 67 45 8B 6B C6 23 ",
    "ID_permanent": "30 37 20 36 33 20 42 44 20 41 38 20 30 42 20 42 38 20 42 32 20 31 31 20 30 41 20 30 30 20 36 37 20 34 35 20 38 42 20 36 42 20 43 36 20 32 33 20 ",
    "pdf_version": "1.6",
    "pdfa_compliance_level": "",
    "is_encrypted": false,
    "has_acroform": false,
    "is_digitally_signed": false,
    "pdfua_compliance_level": "",
    "page_count": 1,
    "has_embedded_files": false,
    "is_certified": false,
    "is_XFA": false,
    "language": "es-ES"
  },
  "elements": [
    {
      "Bounds": [
        85.10400390625,
        757.9199981689453,
        389.3470916748047,
        770.5939178466797],
      "ClipBounds": [
        85.10400390625,
        757.9199981689453,
        389.3470916748047,
        770.5939178466797],
      "Font": {
        "alt_family_name": "Calibri",
        "embedded": true,
        "encoding": "WinAnsiEncoding",
        "family_name": "Calibri",
        "font_type": "TrueType",
        "italic": false,
        "monospaced": false,
        "name": "BCDEEE+Calibri",
        "subset": true,
        "weight": 400
      },
      "HasClip": true,
      "Lang": "en",
      "Page": 0,
      "Path": "//Document/Sect/P",
      "Text": "This text mixes regulard and italics and doesn\u2019t seem to be working. ",
      "TextSize": 11.039993286132812,
      "attributes": {
        "LineHeight": 13.25,
        "SpaceAfter": 9.375
      }
    },
    {
      "Bounds": [
        85.10400390625,
        735.3399963378906,
        389.3470916748047,
        748.013916015625],
      "ClipBounds": [
        85.10400390625,
        735.3399963378906,
        389.3470916748047,
        748.013916015625],
      "Font": {
        "alt_family_name": "Calibri",
        "embedded": true,
        "encoding": "WinAnsiEncoding",
        "family_name": "Calibri",
        "font_type": "TrueType",
        "italic": false,
        "monospaced": false,
        "name": "BCDEEE+Calibri",
        "subset": true,
        "weight": 400
      },
      "HasClip": true,
      "Lang": "en",
      "Page": 0,
      "Path": "//Document/Sect/P[2]",
      "Text": "This text mixes regulard and italics and doesn\u2019t seem to be working. ",
      "TextSize": 11.039993286132812,
      "attributes": {
        "LineHeight": 13.25,
        "SpaceAfter": 9.25
      }
    },
    {
      "Bounds": [
        87.62399291992188,
        712.8999938964844,
        391.86708068847656,
        725.5739135742188],
      "ClipBounds": [
        87.62399291992188,
        712.8999938964844,
        391.86708068847656,
        725.5739135742188],
      "Font": {
        "alt_family_name": "Calibri",
        "embedded": true,
        "encoding": "WinAnsiEncoding",
        "family_name": "Calibri",
        "font_type": "TrueType",
        "italic": false,
        "monospaced": false,
        "name": "BCDEEE+Calibri",
        "subset": true,
        "weight": 400
      },
      "HasClip": true,
      "Lang": "en",
      "Page": 0,
      "Path": "//Document/Sect/P[3]",
      "Text": "This text mixes regulard and italics and doesn\u2019t seem to be working. ",
      "TextSize": 11.039993286132812,
      "attributes": {
        "LineHeight": 13.25
      }
    }],
  "pages": [
    {
      "boxes": {
        "CropBox": [
          0.0,
          0.0,
          595.3200073242188,
          841.9199829101562],
        "MediaBox": [
          0.0,
          0.0,
          595.3200073242188,
          841.9199829101562]
      },
      "height": 841.9199829101562,
      "is_scanned": false,
      "page_number": 0,
      "rotation": 0,
      "width": 595.3200073242188
    }]
}

 

Platform and Version

Sample Code that illustrates the problem

/**
 * Runs an Extract PDF operation on a local file and saves the result.
 *
 * Requests the TEXT and TABLES elements plus FIGURES and TABLES renditions.
 * Anything thrown while executing the operation or saving the result is
 * printed with console.log and not rethrown, so the returned promise still
 * resolves in that case. Failures while building the options or the input
 * file reference happen outside the try block and propagate to the caller.
 *
 * Relies on `PDFServicesSdk`, `executionContext` and `logger` being in scope.
 *
 * @param filepath - source file handed to FileRef.createFromLocalFile
 *   (presumably a local path string).
 * @param outputpath - destination handed to result.saveAsFile; also echoed
 *   in the progress log line.
 * @returns {Promise<void>}
 */
const extractPDF2Zip_v2 = async (filepath, outputpath) => {
  const { ExtractPDF, FileRef } = PDFServicesSdk;
  const {
    ExtractPdfOptions,
    ExtractElementType,
    ExtractRenditionsElementType,
  } = ExtractPDF.options;

  // Assemble the extraction options one builder step at a time.
  const withElements = new ExtractPdfOptions.Builder().addElementsToExtract(
    ExtractElementType.TEXT,
    ExtractElementType.TABLES
  );
  const withRenditions = withElements.addElementsToExtractRenditions(
    ExtractRenditionsElementType.FIGURES,
    ExtractRenditionsElementType.TABLES
  );
  const extractOptions = withRenditions.build();

  // Fresh operation, fed with the source file and the options built above.
  const operation = ExtractPDF.Operation.createNew();
  const source = FileRef.createFromLocalFile(
    filepath,
    ExtractPDF.SupportedSourceFormat.pdf
  );
  operation.setInput(source);
  operation.setOptions(extractOptions);

  const divider = `----------------------------------`;

  try {
    const extraction = await operation.execute(executionContext);
    logger.info(divider);
    logger.info(`PARTIAL: Save file '${outputpath}' ...`);
    logger.info(divider);
    await extraction.saveAsFile(outputpath);
  } catch (err) {
    // SDK service errors and everything else differ only in the message used.
    const sdkErrors = PDFServicesSdk.Error;
    const isSdkError =
      err instanceof sdkErrors.ServiceApiError ||
      err instanceof sdkErrors.ServiceUsageError;
    const message = isSdkError
      ? "Exception encountered while executing operation1"
      : "Exception encountered while executing operation2";
    console.log(message, err);
  }
};
This topic has been closed for replies.
Correct answer Joel Geraci

Add...

.getStylingInfo(true)
...to your options. Getting styling info is computationally expensive, so we made it optional. It is off by default, but you can turn it on with the setting above. In paragraphs with mixed type, you'll see a "kids" property with a bunch of spans, one span for each style.

1 reply

Joel Geraci
Community Expert
Joel GeraciCommunity ExpertCorrect answer
Community Expert
September 26, 2023

Add...

.getStylingInfo(true)
...to your options. Getting styling info is computationally expensive, so we made it optional. It is off by default, but you can turn it on with the setting above. In paragraphs with mixed type, you'll see a "kids" property with a bunch of spans, one span for each style.
Participant
September 26, 2023

Thank you very much, it helped me as well. I appreciate your effort.