itext7 generation pdfA - 1B validation problem - validation

I am using itext7 in java to create a 1-B PdfA using this simple code:
import com.itextpdf.kernel.font.PdfFont;
import com.itextpdf.kernel.font.PdfFontFactory;
import com.itextpdf.kernel.geom.PageSize;
import com.itextpdf.kernel.pdf.PdfAConformanceLevel;
import com.itextpdf.kernel.pdf.PdfDocumentInfo;
import com.itextpdf.kernel.pdf.PdfOutputIntent;
import com.itextpdf.kernel.pdf.PdfString;
import com.itextpdf.kernel.pdf.PdfViewerPreferences;
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.kernel.xmp.PdfConst;
import com.itextpdf.kernel.xmp.XMPConst;
import com.itextpdf.kernel.xmp.XMPException;
import com.itextpdf.kernel.xmp.XMPMeta;
import com.itextpdf.kernel.xmp.XMPMetaFactory;
import com.itextpdf.layout.Document;
import com.itextpdf.layout.element.Paragraph;
import com.itextpdf.pdfa.PdfADocument;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.text.SimpleDateFormat;
import gestionepdf.PdfDocument_Configurazione_Text;
/**
*
* #author UC9001309
*/
public class TestCreatePDFA {
public static final String courier = "C:\\Windows\\fonts\\couri.ttf";
public static final String times = "C:\\Windows\\fonts\\times.ttf";
public static final String helvetica = "C:\\Windows\\fonts\\helvetica.ttf";
public static void main(String[] args) throws FileNotFoundException, IOException, XMPException {
// TODO code application logic here
PdfWriter pdfWriter = new PdfWriter("C:\\Temp\\" + new SimpleDateFormat("yyyyMMddHHmmss").format(new java.util.Date()) + ".pdf");
PdfADocument pdfA = new PdfADocument( pdfWriter,PdfAConformanceLevel.PDF_A_1B,new PdfOutputIntent("Custom", "", "https://www.color.org",
"sRGB2014", new FileInputStream("C:\\Users\\UC9001309\\Documents\\NetBeansProjects\\GestionePdf\\sRGB2014.icc")));
Document document = new Document(pdfA, PageSize.A4, false);
XMPMeta meta = XMPMetaFactory.create();
PdfDocumentInfo info = pdfA.getDocumentInfo();
pdfA.getCatalog().setViewerPreferences(new PdfViewerPreferences().setDisplayDocTitle(true));
pdfA.getCatalog().setLang(new PdfString("it-IT"));
info.addCreationDate();
meta.setProperty(XMPConst.NS_XMP, PdfConst.CreateDate, info.getMoreInfo("CreationDate"));
info.addModDate();
meta.setProperty(XMPConst.NS_XMP, PdfConst.ModifyDate, info.getMoreInfo("ModDate"));
info.setAuthor("MyAuthor");
info.setCreator("MyCreator");
info.setProducer("Producer");
info.setTitle("TEST PdfA " );
PdfFont font_h = PdfFontFactory.createFont(helvetica);
PdfFont font_c = PdfFontFactory.createFont(courier);
PdfFont font_t = PdfFontFactory.createFont(times);
Paragraph p = new Paragraph();
p.setFont(font_c);
p.setItalic();
p.add("Prova pdfa");
p.getAccessibilityProperties().setRole("P");
Paragraph p1 = new Paragraph();
p1.setFont(font_h);
p1.add("Prova pdfa");
p1.getAccessibilityProperties().setRole("P");
Paragraph p2 = new Paragraph();
p2.setFont(font_t);
p2.add("Prova pdfa");
p2.getAccessibilityProperties().setRole("P");
document.add(p);
document.add(p1);
document.add(p2);
pdfA.setXmpMetadata(meta);
document.close();
}
}
But if I try to validate, using online sites some are fine with others not. In particular, a validator gives me this error :
<?xml version="1.0" encoding="UTF-8"?>
-<ValidationReport>
<VersionInformation Version="14.1.182" ID="GdPicture.NET.14"/>
<ValidationProfile Level="B" Part="1" Conformance="PDF/A"/>
<FileInfo FileSize="57849 bytes" FileName="20220726115840.pdf"/>
<ValidationResult Statement="PDF file is not compliant with validation profile requirements." sCompliant="False"/>
-<Details>
-<FailedChecks Count="4">
-<Check ID="InconsistentPDFInfo" OccurenceCount="4">
<Occurence Statement="The PDF Information Dictionary entry CreationDate is not consistent with PDF XMP metadata information." ObjReference="None" Context="Document"/>
<Occurence Statement="The PDF Information Dictionary entry Creator is not consistent with PDF XMP metadata information." ObjReference="None" Context="Document"/>
<Occurence Statement="The PDF Information Dictionary entry ModDate is not consistent with PDF XMP metadata information." ObjReference="None" Context="Document"/>
<Occurence Statement="The PDF Information Dictionary entry Producer is not consistent with PDF XMP metadata information." ObjReference="None" Context="Document"/>
</Check>
</FailedChecks>
</Details>
</ValidationReport>
This is xmp of my pdfa :
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Adobe XMP Core 5.1.0-jc003">
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdf:Description rdf:about=""
xmlns:xmp="http://ns.adobe.com/xap/1.0/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:pdf="http://ns.adobe.com/pdf/1.3/"
xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/"
xmp:CreateDate="2022-07-26T16:13:07+02:00"
xmp:ModifyDate="2022-07-26T16:13:07+02:00"
xmp:CreatorTool="MyCreator"
pdf:Producer="Producer; modified using iText® Core 7.2.2 (AGPL version) ©2000-2022 iText Group NV"
pdfaid:part="1"
pdfaid:conformance="B">
<dc:creator>
<rdf:Seq>
<rdf:li>MyAuthor</rdf:li>
</rdf:Seq>
</dc:creator>
<dc:title>
<rdf:Alt>
<rdf:li xml:lang="x-default">TEST PdfA </rdf:li>
</rdf:Alt>
</dc:title>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>
This is the dictionary :
Author - MyAuthor
CreationDate - D:20220727094440+02'00'
Creator - MyCreator
ModDate - D:20220727094440+02'00'
Producer - Producer
Title - TEST PdfA
I don't understand if I'm wrong or the validator?
Thanks in advance for anyone who wants to help !

Related

itext 7: converting HTML to PDF fails when using landscape mode in some cases (test repo link included)

I created a small GitHub repo that demonstrates the problem:
See https://github.com/abrighton/itext-bug.
The repo contains a generated HTML file (TEST.html) that causes itext 7 to throw an exception when converting to PDF in landscape mode:
Exception in thread "main" java.lang.UnsupportedOperationException
at com.itextpdf.layout.renderer.AreaBreakRenderer.draw(AreaBreakRenderer.java:83)
at com.itextpdf.layout.renderer.AbstractRenderer.drawChildren(AbstractRenderer.java:855)
at com.itextpdf.layout.renderer.BlockRenderer.draw(BlockRenderer.java:580)
at com.itextpdf.layout.renderer.AbstractRenderer.drawChildren(AbstractRenderer.java:855)
at com.itextpdf.layout.renderer.BlockRenderer.draw(BlockRenderer.java:580)
at com.itextpdf.layout.renderer.DocumentRenderer.flushSingleRenderer(DocumentRenderer.java:147)
at com.itextpdf.layout.renderer.RootRenderer.processRenderer(RootRenderer.java:380)
at com.itextpdf.layout.renderer.RootRenderer.shrinkCurrentAreaAndProcessRenderer(RootRenderer.java:369)
at com.itextpdf.html2pdf.attach.impl.layout.HtmlDocumentRenderer.shrinkCurrentAreaAndProcessRenderer(HtmlDocumentRenderer.java:347)
at com.itextpdf.layout.renderer.RootRenderer.addChild(RootRenderer.java:264)
at com.itextpdf.html2pdf.attach.impl.layout.HtmlDocumentRenderer.processWaitingElement(HtmlDocumentRenderer.java:234)
at com.itextpdf.html2pdf.attach.impl.layout.HtmlDocumentRenderer.close(HtmlDocumentRenderer.java:194)
at com.itextpdf.layout.Document.close(Document.java:135)
at com.itextpdf.html2pdf.HtmlConverter.convertToPdf(HtmlConverter.java:261)
at com.itextpdf.html2pdf.HtmlConverter.convertToPdf(HtmlConverter.java:221)
at ItextBug$.saveAsPdf(ItextBug.scala:15)
at ItextBug$.delayedEndpoint$ItextBug$1(ItextBug.scala:23)
at ItextBug$delayedInit$body.apply(ItextBug.scala:9)
at scala.Function0.apply$mcV$sp(Function0.scala:39)
at scala.Function0.apply$mcV$sp$(Function0.scala:39)
at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:17)
at scala.App.$anonfun$main$1(App.scala:73)
at scala.App.$anonfun$main$1$adapted(App.scala:73)
at scala.collection.IterableOnceOps.foreach(IterableOnce.scala:553)
at scala.collection.IterableOnceOps.foreach$(IterableOnce.scala:551)
at scala.collection.AbstractIterable.foreach(Iterable.scala:921)
at scala.App.main(App.scala:73)
at scala.App.main$(App.scala:71)
at ItextBug$.main(ItextBug.scala:9)
at ItextBug.main(ItextBug.scala)
Here is the code:
import java.io.{ByteArrayInputStream, FileOutputStream, OutputStream}
import java.nio.file.{Files, Paths}
import com.itextpdf.html2pdf.HtmlConverter
import com.itextpdf.kernel.geom.PageSize
import com.itextpdf.kernel.pdf.{PdfDocument, PdfWriter}
// Run this from the directory containing TEST.html
object ItextBug extends App {
def saveAsPdf(out: OutputStream, html: String, orientation: String): Unit = {
val pageSize = if (orientation == "landscape") PageSize.LETTER.rotate() else PageSize.LETTER
val writer: PdfWriter = new PdfWriter(out)
val document: PdfDocument = new PdfDocument(writer)
document.setDefaultPageSize(pageSize)
HtmlConverter.convertToPdf(new ByteArrayInputStream(html.getBytes()), document)
out.close()
}
val html = new String(Files.readAllBytes(Paths.get("TEST.html")))
val out = new FileOutputStream("TEST.pdf")
// This version crashes
saveAsPdf(out, html, "landscape")
// This version works
// saveAsPdf(out, html, "portrait")
}
Is there anything wrong with this code?
I have only seen this happen on certain input HTML files. There could be something odd in there, however the HTML displays fine in the browser. Browsers don't throw exceptions for bad HTML and the HTML to PDF converter probably should not either, assuming that is the problem.
(Uses Scala-2.13.1, Java-11)

Using ImageIcon to access a picture, can't access it, how to fix?

I am using ImageIcon to access a photo I have cropped. I put all the cropped pictures in a pic source folder in side the project. Yet when I try to use this.getClass().getResource("image 2.png") to find the image 2.png photo, the code couldn't find it. Is there anyway to fix this, do I have to re upload all the picture into a different folder?
the "image 2.png" is inside the pic source folder, which is within the folder of the project Alle, according to the navigator panel on the right. (I am using eclipse)
Here is my code:
import java.awt.*;
import java.io.File;
import java.io.IOException;
import javax.swing.*;
public class alle extends JFrame {
JButton button1, button2;
JLabel Label1, Label2;
ImageIcon Icon1;
ImageIcon Icon2;
public alle() {
setLayout(new GridLayout(2,3)); //create a gridwolrd like thing that's like 2 row, 3 column
button1 = new JButton("set");
add(button1);
Label1 = new JLabel(" button");
add(Label1);
Icon1 = new ImageIcon(this.getClass().getResource("/alle/pic/image 2.png")) ;
JLabel p = new JLabel(Icon1);
add(p);
}
public static void main (String arg[]) {
alle adfc = new alle();
adfc.setResizable(false);
adfc.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
adfc.setVisible(true);
adfc.pack();
File f = new File ("/alle/pic/image 2.png");
System.out.print(f.exists());}}
I also had this kind of problem about a year ago. You should be able to resolve it by writing a loader method for retrieving the images. You can find a detailed answer on this article which also worked for me: Images Won't Appear In A Jar
If you don't want to read through the whole post here's the code sample that should resolve things, you only need to adjust it to your needs:
public ImageIcon loadIcon(String iconName) throws IOException {
ClassLoader loader = this.getClass().getClassLoader();
BufferedImage icon = ImageIO.read(loader.getResourceAsStream(iconName));
return new ImageIcon(icon);
}
From this method you should be able to retrieve your images. I hope this helps you.

Spark Scala how to execute

I have written the following code, which returns a "Class not found" exception. I'm not sure what I need to do to load data from a csv file into SparkSQL.
import org.apache.spark.SparkContext
/**
* Loading sales csv using DataFrame API
*/
object CsvDataInput {
def main(args: Array[String]) {
val sc = new SparkContext(args(0), "Csv loading example")
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val df = sqlContext.load("com.databricks.spark.csv", Map("path" -> args(1),"header"->"true"))
df.printSchema()
df.registerTempTable("data")
val aggDF = sqlContext.sql("select * from data")
println(aggDF.collectAsList())
}
}
Try replacing this line
import org.apache.spark.SparkContext
with this
import org.apache.spark.*
You are importing just part of the library, but using classes from outside this part. Also, your import is actually misspelled - it should read org.apache.spark.sql.SQLContext, and you used some other package, not related to the code presented.

PDF Box generating blank images due to JBIG2 Images in it

Let me give you an overview of my project first. I have a pdf which I need to convert into images(One image for one page) using PDFBox API and write all those images onto a new pdf using PDFBox API itself. Basically, converting a pdf into a pdf, which we refer to as PDF Transcoding.
For certain pdfs, which contain JBIG2 images, PDFbox implementation of convertToImage() method is failing silently without any exceptions or errors and finally, producing a PDF, but this time, just with blank content(white). The message I am getting on the console is:
Dec 06, 2013 5:15:42 PM org.apache.pdfbox.filter.JBIG2Filter decode
SEVERE: Can't find an ImageIO plugin to decode the JBIG2 encoded datastream.
Dec 06, 2013 5:15:42 PM org.apache.pdfbox.pdmodel.graphics.xobject.PDPixelMap getRGBImage
SEVERE: Something went wrong ... the pixelmap doesn't contain any data.
Dec 06, 2013 5:15:42 PM org.apache.pdfbox.util.operator.pagedrawer.Invoke process
WARNING: getRGBImage returned NULL
I need to know how to resolve this issue? We have something like:
import org.apache.pdfbox.filter.JBIG2Filter;
which I don't know how to implement.
I am searching on that, but to no avail. Could anyone please suggest?
Take a look at this ticket in PDFBox https://issues.apache.org/jira/browse/PDFBOX-1067 . I think the answer to your question is:
to make sure that you have JAI and the JAI-ImageIO plugins installed for your version of Java: decent installation instructions are available here: http://docs.geoserver.org/latest/en/user/production/java.html
to use the JBIG2-imageio plugin, (newer versions are licensed under the Apache2 license) https://github.com/levigo/jbig2-imageio/
I had the same problem and I fixed it by adding this dependency in my pom.xml :
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>jbig2-imageio</artifactId>
<version>3.0.2</version>
</dependency>
Good luck.
I had the exact same problem.
I downloaded the jar from
jbig2-imageio
and I just included it in my project's application libraries, and it worked right out of the box. As adam said, it uses GPL3.
Installing the JAI seems not needed.
I only needed to download the levigo-jbig2-imageio-1.6.5.jar, place it in the folder of my dependency-jars and in eclipse add it to the java build path libraries.
https://github.com/levigo/jbig2-imageio/
import java.awt.image.BufferedImage
import org.apache.pdfbox.cos.COSName
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.pdmodel.PDPage
import org.apache.pdfbox.pdmodel.PDPageTree
import org.apache.pdfbox.pdmodel.PDResources
import org.apache.pdfbox.pdmodel.graphics.PDXObject
import org.apache.pdfbox.rendering.ImageType
import org.apache.pdfbox.rendering.PDFRenderer
import org.apache.pdfbox.tools.imageio.ImageIOUtil
import javax.imageio.ImageIO
import javax.imageio.spi.IIORegistry
import javax.imageio.spi.ImageReaderSpi
import javax.swing.*
import javax.swing.filechooser.FileNameExtensionFilter
public class savePDFAsImage{
String path = "c:/pdfImage/"
//allow pdf file selection for extracting
public static File selectPDF() {
File file = null
JFileChooser chooser = new JFileChooser()
FileNameExtensionFilter filter = new FileNameExtensionFilter("PDF", "pdf")
chooser.setFileFilter(filter)
chooser.setMultiSelectionEnabled(false)
int returnVal = chooser.showOpenDialog(null)
if (returnVal == JFileChooser.APPROVE_OPTION) {
file = chooser.getSelectedFile()
println "Please wait..."
}
return file
}
public static void main(String[] args) {
try {
// help to view list of plugin registered. check by adding JBig2 plugin and JAI plugin
ImageIO.scanForPlugins()
IIORegistry reg = IIORegistry.getDefaultInstance()
Iterator spIt = reg.getServiceProviders(ImageReaderSpi.class, false)
spIt.each(){
println it.getProperties()
}
testPDFBoxSaveAsImage()
testPDFBoxExtractImagesX()
} catch (Exception e) {
e.printStackTrace()
}
}
public static void testPDFBoxExtractImagesX() throws Exception {
PDDocument document = PDDocument.load(selectPDF())
PDPageTree list = document.getPages()
for (PDPage page : list) {
PDResources pdResources = page.getResources()
for (COSName c : pdResources.getXObjectNames()) {
PDXObject o = pdResources.getXObject(c)
if (o instanceof org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) {
File file = new File( + System.nanoTime() + ".png")
ImageIO.write(((org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject) o).getImage(), "png", file)
}
}
}
document.close()
println "Extraction complete"
}
public static void testPDFBoxSaveAsImage() throws Exception {
PDDocument document = PDDocument.load(selectPDF().getBytes())
PDFRenderer pdfRenderer = new PDFRenderer(document)
for (int page = 0; page < document.getNumberOfPages(); ++page) {
BufferedImage bim = pdfRenderer.renderImageWithDPI(page,300, ImageType.BINARY)
// suffix in filename will be used as the file format
OutputStream fileOutputStream = new FileOutputStream(+ System.nanoTime() + ".png")
boolean b = ImageIOUtil.writeImage(bim, "png",fileOutputStream,300)
}
document.close()
println "Extraction complete"
}
}

Interpreting output from mahout clusterdumper

I ran a clustering test on crawled pages (more than 25K docs ; personal data set).
I've done a clusterdump :
$MAHOUT_HOME/bin/mahout clusterdump --seqFileDir output/clusters-1/ --output clusteranalyze.txt
The output after running cluster dumper is shown 25 elements "VL-xxxxx {}" :
VL-24130{n=1312 c=[0:0.017, 10:0.007, 11:0.005, 14:0.017, 31:0.016, 35:0.006, 41:0.010, 43:0.008, 52:0.005, 59:0.010, 68:0.037, 72:0.056, 87:0.028, ... ] r=[0:0.442, 10:0.271, 11:0.198, 14:0.369, 31:0.421, ... ]}
...
VL-24868{n=311 c=[0:0.042, 11:0.016, 17:0.046, 72:0.014, 96:0.044, 118:0.015, 135:0.016, 195:0.017, 318:0.040, 319:0.037, 320:0.036, 330:0.030, ...] ] r=[0:0.740, 11:0.287, 17:0.576, 72:0.239, 96:0.549, 118:0.273, ...]}
How to interpret this output?
In short : I am looking for document ids which belong to a particular cluster.
What is the meaning of :
VL-x ?
n=y c=[z:z', ...]
r=[z'':z''', ...]
Does 0:0.017 means "0" is the document id which belongs to this cluster?
I already have read on mahout wiki-pages what CL, n, c and r means. But can someone please explain them to me better or points to a resource where it is explained a bit more in detail?
Sorry, if i am asking some stupid questions, but i am a newbie wih apache mahout and using it as part of my course assignment for clustering.
By default, kmeans clustering uses WeightedVector which does not include the data point name. So, you would like to make a sequence file yourself using NamedVector. There is a one to one correspondence between the number of seq files and the mapping tasks. So if your mapping capacity is 12, you want to chop your data into 12 pieces when making seqfiles
NamedVecotr:
vector = new NamedVector(new SequentialAccessSparseVector(Cardinality),arrField[0]);
Basically you need to download the clusteredPoints from your HDFS system and write your own code to output the results. Here is the code that I wrote to output the cluster point membership.
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.math.NamedVector;
public class ClusterOutput {
/**
* #param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
try {
BufferedWriter bw;
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
File pointsFolder = new File(args[0]);
File files[] = pointsFolder.listFiles();
bw = new BufferedWriter(new FileWriter(new File(args[1])));
HashMap<String, Integer> clusterIds;
clusterIds = new HashMap<String, Integer>(5000);
for(File file:files){
if(file.getName().indexOf("part-m")<0)
continue;
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(file.getAbsolutePath()), conf);
IntWritable key = new IntWritable();
WeightedVectorWritable value = new WeightedVectorWritable();
while (reader.next(key, value)) {
NamedVector vector = (NamedVector) value.getVector();
String vectorName = vector.getName();
bw.write(vectorName + "\t" + key.toString()+"\n");
if(clusterIds.containsKey(key.toString())){
clusterIds.put(key.toString(), clusterIds.get(key.toString())+1);
}
else
clusterIds.put(key.toString(), 1);
}
bw.flush();
reader.close();
}
bw.flush();
bw.close();
bw = new BufferedWriter(new FileWriter(new File(args[2])));
Set<String> keys=clusterIds.keySet();
for(String key:keys){
bw.write(key+" "+clusterIds.get(key)+"\n");
}
bw.flush();
bw.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
To complete the answer:
VL-x: is the identifier of the cluster
n=y: is the number of elements in the cluster
c=[z, ...]: is the centroid of the cluster, with the
z's being the weights of the different dimensions
r=[z, ...]: is the radius of the cluster.
More info here:
https://mahout.apache.org/users/clustering/cluster-dumper.html
I think you need to read the source code -- download from http://mahout.apache.org. VL-24130 is just a cluster identifier for a converged cluster.
You can use mahout clusterdump
https://cwiki.apache.org/MAHOUT/cluster-dumper.html

Resources