Skip to content

Update OCRTest to use the IRIS module #156

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 46 additions & 44 deletions Samples/OCRTest/PHP/OCRTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@

// The location of the OCR Module
PDFNet::AddResourceSearchPath("../../../Lib/");

// If the IRIS OCR module is available, use it instead of the default OCR engine
$use_iris = OCRModule::IsIRISModuleAvailable();
if(!OCRModule::IsModuleAvailable()) {
echo "Unable to run OCRTest: PDFTron SDK OCR module not available.\n
---------------------------------------------------------------\n
Expand All @@ -33,17 +36,22 @@
} else
{
//--------------------------------------------------------------------------------
// Example 1) Process image without specifying options, default language - English - is used


// Example 1) Process image
// A) Setup empty destination doc

$doc = new PDFDoc();

// B) Run OCR on the .png with options
// B) Use the IRIS OCR engine if available

OCRModule::ImageToPDF($doc, $input_path."psychomachia_excerpt.png", NULL);
$opts = new OCROptions();
if ($use_iris) {
$opts->SetOCREngine("iris");
}

// C) check the result
// C) Run OCR on the .png with options
OCRModule::ImageToPDF($doc, $input_path."psychomachia_excerpt.png", $opts);

// D) Check the result

$doc->Save($output_path."psychomachia_excerpt.pdf", 0);

Expand All @@ -60,6 +68,9 @@
// B) Setup options with multiple target languages; English will always be considered as a secondary language

$opts = new OCROptions();
if ($use_iris) {
$opts->SetOCREngine("iris");
}
$opts->AddLang("deu");
$opts->AddLang("fra");
$opts->AddLang("eng");
Expand All @@ -85,6 +96,9 @@
// B) Setup options with a single language and an ignore zone

$opts = new OCROptions();
if ($use_iris) {
$opts->SetOCREngine("iris");
}
$opts->AddLang("deu");

$ignore_zones = new RectCollection();
Expand Down Expand Up @@ -112,6 +126,9 @@
// B) Setup options with a single language plus text/ignore zones

$opts = new OCROptions();
if ($use_iris) {
$opts->SetOCREngine("iris");
}
$opts->AddLang("eng");

$ignore_zones = new RectCollection();
Expand Down Expand Up @@ -160,17 +177,24 @@

$doc = new PDFDoc($input_path."zero_value_test_no_text.pdf");

// B) Run OCR on the .pdf with default English language
// B) Use the IRIS OCR engine if available

$opts = new OCROptions();
if ($use_iris) {
$opts->SetOCREngine("iris");
}

// C) Run OCR on the .pdf with the default English language

$json = OCRModule::GetOCRJsonFromPDF($doc, NULL);
$json = OCRModule::GetOCRJsonFromPDF($doc, $opts);

// C) Post-processing step (whatever it might be)
// D) Post-processing step (whatever it might be)

echo "Have OCR result JSON, re-applying to PDF \n";

OCRModule::ApplyOCRJsonToPDF($doc, $json);

// D) check the result
// E) check the result

$doc->Save($output_path."zero_value_test_no_text.pdf", 0);

Expand All @@ -184,55 +208,33 @@

$doc = new PDFDoc();

// B) Run OCR on the .tif with default English language, extracting OCR results in XML format. Note that
// B) Use the IRIS OCR engine if available

$opts = new OCROptions();
if ($use_iris) {
$opts->SetOCREngine("iris");
}

// C) Run OCR on the .tif with the default English language, extracting OCR results in XML format. Note that
// in the process we convert the source image into a PDF. We reuse this PDF document later to add a hidden text layer to it.

$xml = OCRModule::GetOCRXmlFromImage($doc, $input_path."physics.tif", NULL);
$xml = OCRModule::GetOCRXmlFromImage($doc, $input_path."physics.tif", $opts);

// C) Post-processing step (whatever it might be)
// D) Post-processing step (whatever it might be)

echo "Have OCR result XML, re-applying to PDF \n";

OCRModule::ApplyOCRXmlToPDF($doc, $xml);

// D) check the result
// E) check the result

$doc->Save($output_path."physics.pdf", 0);

echo "Example 6: extracting and applying OCR XML from physics.tif \n";

echo "Done. \n";


//--------------------------------------------------------------------------------
// Example 7) Resolution can be set manually when the DPI is missing from the metadata or is wrong

// A) Setup empty destination doc

$doc = new PDFDoc();

// B) Setup options with a text zone

$opts = new OCROptions();
$text_zones = new RectCollection();
$text_zones->AddRect(new Rect(140.0, 870.0, 310.0, 920.0));
$opts->AddTextZonesForPage($text_zones, 1);

// C) Manually override DPI

$opts->AddDPI(100);

// D) Run OCR on the .jpg with options

OCRModule::ImageToPDF($doc, $input_path."corrupted_dpi.jpg", $opts);

// E) check the result

$doc->Save($output_path."corrupted_dpi.pdf", 0);

echo "Example 7: converting image with corrupted resolution metadata corrupted_dpi.jpg to pdf with searchable text \n";

}

PDFNet::Terminate();

?>