pythonpdf-创新互联
# 从pdf中读取文本
# 写pdf
# 加密解密pdf
# 合并pdf,加水印
# pip install PyPDF2
%cd D:\python全站\office
import PyPDF2
D:\python全站\office
# Open the sample PDF in binary mode. The handle stays open here: `pdf` is
# used again further down to copy the second page, after which
# pdf_obj.close() is called.
pdf_obj = open('coop.pdf', 'rb')
# Wrap the open file object in a PyPDF2 reader.
pdf = PyPDF2.PdfFileReader(pdf_obj)
# Page count of the document (the notebook output that follows shows 3).
pdf.numPages
3
# Fetch the first page (index 0) and extract its text. In the output shown
# below only the Latin text comes through; the Chinese text of the page is
# missing, which is why pdfminer3k is used next.
page = pdf.getPage(0)
page.extractText() # extract the text of this page
'\n\n \n \n1\\\n1\nN¥\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \n\n\n \n \n\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \n \n\n\n \n \n\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \nde8ug word\n \n \n'
# 提取中文 pip install pdfminer3k #支持中文
from pdfminer.pdfinterp import PDFResourceManager, process_pdf # 资源管理
from pdfminer.converter import TextConverter # 文本转换
from pdfminer.layout import LAParams #布局
from io import StringIO # 生成临时文件
def convert_pdf(path):
    """Extract the text of a PDF file with pdfminer and return it as a string.

    Args:
        path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        Everything the pdfminer ``TextConverter`` wrote while the document
        was processed, as a single ``str``.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The in-memory buffer collects the converter output. Using it as a
    # context manager guarantees it is released even if processing fails.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            # The input file is closed as soon as processing ends, including
            # when process_pdf raises (the original leaked it in that case).
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # Read the buffer only after the device is closed, and before the
        # buffer itself is closed by the enclosing ``with``.
        return retstr.getvalue()
# Extract the full text of the sample PDF with the pdfminer-based helper.
s = convert_pdf('coop.pdf')
# print(s)
# convert_pdf('coop.pdf')
# Split on newline + form feed. In the output shown below this gives one
# string per page, followed by a trailing empty string.
s.split('\n\x0c')
['测试语句 \n\n第 1 页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n测试语句 \n\n第一页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n测试语句 \n\n第一页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n \n \n \n \n ',
'测试语句 \n\n第 2 页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n \n \n \n \n ',
'de8ug word \n\n测试语句 \n\n第 3 页 \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\nde8ug word \n\n \n \n ',
'']
# Write a PDF: take the second page (index 1) of the reader opened earlier
# and save it as a new one-page PDF.
pdf_writer = PyPDF2.PdfFileWriter()
page = pdf.getPage(1)
pdf_writer.addPage(page)
with open('coop-1.pdf', 'wb') as out_file:
    pdf_writer.write(out_file)
# The source handle opened earlier is only closed now, after the new file
# has been written.
pdf_obj.close()
# Encrypt a PDF: copy every page of coop.pdf into a fresh writer, set a
# password on the writer, and save the result as coop-s.pdf.
with open('coop.pdf', 'rb') as source_file:
    pdf = PyPDF2.PdfFileReader(source_file)
    pdf_writer = PyPDF2.PdfFileWriter()
    for page_num in range(pdf.numPages):
        pdf_writer.addPage(pdf.getPage(page_num))
    pdf_writer.encrypt('hicoop')
    # The output is written while the source file is still open.
    # NOTE(review): presumably PyPDF2 reads page content lazily from the
    # source stream at write time - confirm before moving this out.
    with open('coop-s.pdf', 'wb') as encrypted_file:
        pdf_writer.write(encrypted_file)
# Decrypt: open the encrypted copy, report whether it is encrypted, then
# unlock it with the password that was used to encrypt it.
with open('coop-s.pdf', 'rb') as encrypted_file:
    pdf = PyPDF2.PdfFileReader(encrypted_file)
    print(pdf.isEncrypted)
    pdf.decrypt('hicoop')
    # Pages can only be used normally once the document has been decrypted.
    pdf.getPage(0)
True
# Merge PDFs / add a watermark: stamp the first page of a watermark PDF onto
# every page of coop.pdf and save the result as coop-watermarked.pdf.
#
# Fix: the watermark used to be read from 'coop-watermarked.pdf', which is
# the very file this cell writes. The traceback that follows in this
# write-up comes from that earlier version: the reader fails on that file at
# stream.seek(-1, 2), the step just before PyPDF2's "Cannot read an empty
# file" check, which is consistent with the file being empty (presumably
# truncated by an earlier run that opened it with 'wb'). The watermark is now
# read from its own file, so the input can never be clobbered by the output.
# 'watermark.pdf' is a placeholder name: point it at the PDF whose first
# page should be used as the watermark.
with open('coop.pdf', 'rb') as f_in, open('watermark.pdf', 'rb') as f_w:
    pdf = PyPDF2.PdfFileReader(f_in)
    pdf_w = PyPDF2.PdfFileReader(f_w)
    # The same watermark page is merged onto every page, so fetch it once.
    watermark_page = pdf_w.getPage(0)

    pdf_write = PyPDF2.PdfFileWriter()
    for page_num in range(pdf.numPages):
        page = pdf.getPage(page_num)
        page.mergePage(watermark_page)
        pdf_write.addPage(page)

    # Write while both inputs are still open.
    with open('coop-watermarked.pdf', 'wb') as f_out:
        pdf_write.write(f_out)
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
in ()
3 with open('coop-watermarked.pdf', 'rb') as f_w:
4 pdf = PyPDF2.PdfFileReader(f_in)
----> 5 pdf_w = PyPDF2.PdfFileReader(f_w)
6
7 pdf_write = PyPDF2.PdfFileWriter()
c:\users\coop\miniconda3\envs\coop\lib\site-packages\PyPDF2\pdf.py in __init__(self, stream, strict, warndest, overwriteWarnings)
1082 stream = BytesIO(b_(fileobj.read()))
1083 fileobj.close()
-> 1084 self.read(stream)
1085 self.stream = stream
1086
c:\users\coop\miniconda3\envs\coop\lib\site-packages\PyPDF2\pdf.py in read(self, stream)
1687 if debug: print(">>read", stream)
1688 # start at the end:
-> 1689 stream.seek(-1, 2)
1690 if not stream.tell():
1691 raise utils.PdfReadError('Cannot read an empty file')
OSError: [Errno 22] Invalid argument
另外有需要云服务器可以了解下创新互联scvps.cn,海内外云服务器15元起步,三天无理由+7*72小时售后在线,公司持有idc许可证,提供“云服务器、裸金属服务器、高防服务器、香港服务器、美国服务器、虚拟主机、免备案服务器”等云主机租用服务以及企业上云的综合解决方案,具有“安全稳定、简单易用、服务可用性高、性价比高”等特点与优势,专为企业上云打造定制,能够满足用户丰富、多元化的应用场景需求。
为西市等地区用户提供了全套网页设计制作服务,及西市网站建设行业解决方案。主营业务为成都网站建设、成都网站设计、西市网站设计,以传统方式定制建设网站,并提供域名空间备案等一条龙服务,秉承以专业、用心的态度为用户提供真诚的服务。我们深信只要达到每一位用户的要求,就会得到认可,从而选择与我们长期合作。这样,我们也可以走得更远!分享文章:pythonpdf-创新互联
链接地址:http://lswzjz.com/article/cojgsd.html