diff --git a/pyPdf/generic.py b/pyPdf/generic.py index aaf5031..8d33591 100644 --- a/pyPdf/generic.py +++ b/pyPdf/generic.py @@ -404,13 +404,16 @@ def writeToStream(self, stream, encryption_key): obj = ByteStringObject(bytearr) obj.writeToStream(stream, None) else: - stream.write("(") - for c in bytearr: - if not c.isalnum() and c != ' ': - stream.write("\\%03o" % ord(c)) - else: - stream.write(c) - stream.write(")") + if bytearr == "/Page" : # correction by Dysmas : otherwise writes (\057Page) instead of /Page, which is valid but not supported by poppler + stream.write(bytearr) + else : + stream.write("(") + for c in bytearr: + if not c.isalnum() and c != ' ': + stream.write("\\%03o" % ord(c)) + else: + stream.write(c) + stream.write(")") class NameObject(str, PdfObject): @@ -797,4 +800,3 @@ def decode_pdfdocencoding(byte_array): continue assert char not in _pdfDocEncoding_rev _pdfDocEncoding_rev[char] = i - diff --git a/pyPdf/pdf.py b/pyPdf/pdf.py index bf60d01..b09eb4e 100644 --- a/pyPdf/pdf.py +++ b/pyPdf/pdf.py @@ -36,6 +36,7 @@ A pure-Python PDF library with very minimal capabilities. It was designed to be able to split and merge PDF files by page, and that's about all it can do. It may be a solid base for future PDF file work in Python. 
+version 1.13d """ __author__ = "Mathieu Fenniak" __author_email__ = "biziqe@mathieu.fenniak.net" @@ -1018,6 +1019,7 @@ def createBlankPage(pdf=None, width=None, height=None): page.__setitem__(NameObject('/Type'), NameObject('/Page')) page.__setitem__(NameObject('/Parent'), NullObject()) page.__setitem__(NameObject('/Resources'), DictionaryObject()) + page.__setitem__(NameObject('/Contents'), ArrayObject([])) if width is None or height is None: if pdf is not None and pdf.getNumPages() > 0: lastpage = pdf.getPage(pdf.getNumPages() - 1) @@ -1061,8 +1063,16 @@ def _mergeResources(res1, res2, resource): page2Res = res2.get(resource, DictionaryObject()).getObject() renameRes = {} for key in page2Res.keys(): - if newRes.has_key(key) and newRes[key] != page2Res[key]: - newname = NameObject(key + "renamed") + if newRes.has_key(key) and ( newRes[key] != page2Res[key] + or resource == "/XObject" ) : + i = 1 + while True : + if newRes.has_key(key + "renamed" + str(i)) : + i = i + 1 + else : + newname = NameObject(key + "renamed" + str(i)) + break + renameRes[key] = newname newRes[newname] = page2Res[key] elif not newRes.has_key(key): @@ -1070,6 +1080,7 @@ def _mergeResources(res1, res2, resource): return newRes, renameRes _mergeResources = staticmethod(_mergeResources) + def _contentStreamRename(stream, rename, pdf): if not rename: return stream @@ -1092,6 +1103,15 @@ def _pushPopGS(contents, pdf): return stream _pushPopGS = staticmethod(_pushPopGS) + + def _addCode(contents, pdf, code, endCode = ""): + + stream = ContentStream(contents, pdf) + stream.operations.insert(0, [[], code]) + stream.operations.append([[], endCode]) + return stream + _addCode = staticmethod(_addCode) + def _addTransformationMatrix(contents, pdf, ctm): # adds transformation matrix at the beginning of the given # contents stream. @@ -1349,6 +1369,107 @@ def scaleTo(self, width, height): self.mediaBox.getLowerLeft_x ()) self.scale(sx, sy) + + # Variant of the mergePage function. 
+ # Merges the content streams of several pages and code strings into one page. + # Resource references (e.g. fonts) are maintained from all pages. + # The parameter ar_data is an array containing code strings and PageObjects. + # ContentStream is called only if necessary because it calls ParseContentStream + # which is slow. Otherwise the Content is directly extracted and added to the code. + + def mergePage3(self, ar_data ): + + newResources = DictionaryObject() + rename = {} + originalResources = self["/Resources"].getObject() + code_s = "" + + if isinstance(ar_data, PageObject) : + ar_data = [ar_data] + strType = type("x") + for data in ar_data : + if isinstance(data, PageObject) : + + # Now we work on merging the resource dictionaries. This allows us + # to find out what symbols in the content streams we might need to + # rename. + pagexResources = data["/Resources"].getObject() + + for res in "/ExtGState", "/Font", "/XObject", "/ColorSpace", "/Pattern", "/Shading": + new, newrename = PageObject._mergeResources(originalResources, pagexResources, res) + if new: + newResources[NameObject(res)] = new + rename.update(newrename) + + # Combine /Resources sets. + originalResources.update(newResources) + + # Combine /ProcSet sets. 
+ newResources[NameObject("/ProcSet")] = ArrayObject( + frozenset(originalResources.get("/ProcSet", ArrayObject()).getObject()).union( + frozenset(pagexResources.get("/ProcSet", ArrayObject()).getObject()) + ) + ) + + if len(rename) > 0 : + pagexContent = data['/Contents'].getObject() + pagexContent = PageObject._contentStreamRename(pagexContent, rename, self.pdf) + code_s += pagexContent.getData() + "\n" + else : + page_keys = data.keys() + if "/Contents" in page_keys : # if page is not blank + code_s += self.extractContent(data["/Contents"]) + "\n" + + + else : + code_s += data + "\n" + + + originalContent = self["/Contents"].getObject() + outputContent = PageObject._addCode(originalContent, self.pdf, code_s) + + self[NameObject('/Contents')] = outputContent + self[NameObject('/Resources')] = originalResources + + + + def setContent(self, data ): + + + newResources = DictionaryObject() + rename = {} + #originalResources = self["/Resources"].getObject() + originalContent = self["/Contents"].getObject() + + stream = ContentStream(originalContent, self.pdf) + stream.operations = [] + stream.operations.append([[], data]) + + + self[NameObject('/Contents')] = stream + #self[NameObject('/Resources')] = originalResources + + + + def extractContent(self,data) : + code_s = "" + pageContent = data.getObject() + if isinstance(pageContent, ArrayObject) : + for data2 in pageContent : + code_s += self.extractContent(data2) + else : + if isinstance(data, TextStringObject) : + code_s += data + else : + try : + decodedData = filters.decodeStreamData(pageContent) + code_s += decodedData + except : + print "le code n'a pas pu etre extrait" + + return code_s + + ## # Compresses the size of this page by joining all content streams and # applying a FlateDecode filter. 
@@ -1552,7 +1673,8 @@ def _getData(self): op.writeToStream(newdata, None) newdata.write(" ") newdata.write(operator) - newdata.write("\n") + newdata.write("\n") #Bug corrected by Dysmas 10/2010 + return newdata.getvalue() def _setData(self, value): @@ -1868,4 +1990,3 @@ def _alg35(password, rev, keylen, owner_entry, p_entry, id1_entry, metadata_encr # output.addPage(page1) # output.write(file("test\\merge-test.pdf", "wb")) -