# PDF extraction with Python: text, embedded images, and tables.
# Requirements:
#   pip install textract
#   pip install tabula-py
#   pip install PyMuPDF
import textract as extract
import tabula as tb
import fitz
def Extract_Text(pdf):
    """Extract the text content of a PDF file and print it.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        The value returned by ``textract.process`` for the file
        (textract documents this as a byte string).
    """
    # Read the file the caller asked for; the previous version ignored
    # ``pdf`` and always processed the hard-coded 'test.pdf'.
    text = extract.process(pdf)
    print("Text: ", text)
    return text
def Extract_Photos(pdf):
    """Save every image embedded in a PDF as a PNG file and print each pixmap.

    Images are written to the current working directory as ``test_1.png``,
    ``test_2.png``, ... in the order they are encountered.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        List of the PNG file names that were written.
    """
    saved = []
    # Open the requested file (previously hard-coded to 'test.pdf') and make
    # sure the document is closed even if saving an image fails.
    with fitz.open(pdf) as doc:
        for page in doc:
            # get_images() is the current name of the deprecated
            # getImageList(); the first item of each entry is the image xref.
            for img in page.get_images():
                xref = img[0]
                # Build the pixmap from the embedded image itself: the page's
                # pixmap method renders the page and does not accept an xref.
                pix = fitz.Pixmap(doc, xref)
                # PNG cannot store CMYK data, so convert such images to RGB.
                if pix.n - pix.alpha > 3:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                filename = f'test_{len(saved) + 1}.png'
                pix.save(filename)  # replaces the deprecated writePNG()
                print("Image: ", pix)
                saved.append(filename)
    return saved
def Extract_Tables(pdf):
    """Extract all tables from a PDF and save them as CSV and Excel files.

    Writes ``test.csv`` and, when at least one table is found, ``test.xlsx``
    (one worksheet per table) to the current working directory.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        The list of tables returned by ``tabula.read_pdf``.
    """
    # Imported locally so this function does not rely on a module-level
    # pandas import.
    import pandas as pd

    # Read the requested file (previously hard-coded to 'test.pdf').
    tables = tb.read_pdf(pdf, pages='all', multiple_tables=True)
    # save in csv
    tb.convert_into(pdf, 'test.csv', output_format='csv', pages='all')
    # save in excel
    # tabula's convert_into documents only csv/json/tsv output, so the Excel
    # file is written from the extracted tables with pandas instead (pandas
    # needs an Excel engine such as openpyxl to be installed).
    if tables:
        with pd.ExcelWriter('test.xlsx') as writer:
            for number, table in enumerate(tables, start=1):
                table.to_excel(writer, sheet_name=f'Table {number}', index=False)
    return tables
# PDF extraction with Python: text, embedded images, and tables.
# Requirements:
#   pip install textract
#   pip install tabula-py
#   pip install PyMuPDF
import textract as extract
import tabula as tb
import fitz
def Extract_Text(pdf):
    """Extract the text content of a PDF file and print it.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        The value returned by ``textract.process`` for the file
        (textract documents this as a byte string).
    """
    # Read the file the caller asked for; the previous version ignored
    # ``pdf`` and always processed the hard-coded 'test.pdf'.
    text = extract.process(pdf)
    print("Text: ", text)
    return text
def Extract_Photos(pdf):
    """Save every image embedded in a PDF as a PNG file and print each pixmap.

    Images are written to the current working directory as ``test_1.png``,
    ``test_2.png``, ... in the order they are encountered.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        List of the PNG file names that were written.
    """
    saved = []
    # Open the requested file (previously hard-coded to 'test.pdf') and make
    # sure the document is closed even if saving an image fails.
    with fitz.open(pdf) as doc:
        for page in doc:
            # get_images() is the current name of the deprecated
            # getImageList(); the first item of each entry is the image xref.
            for img in page.get_images():
                xref = img[0]
                # Build the pixmap from the embedded image itself: the page's
                # pixmap method renders the page and does not accept an xref.
                pix = fitz.Pixmap(doc, xref)
                # PNG cannot store CMYK data, so convert such images to RGB.
                if pix.n - pix.alpha > 3:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                filename = f'test_{len(saved) + 1}.png'
                pix.save(filename)  # replaces the deprecated writePNG()
                print("Image: ", pix)
                saved.append(filename)
    return saved
def Extract_Tables(pdf):
    """Extract all tables from a PDF and save them as CSV and Excel files.

    Writes ``test.csv`` and, when at least one table is found, ``test.xlsx``
    (one worksheet per table) to the current working directory.

    Args:
        pdf: Path of the PDF file to read.

    Returns:
        The list of tables returned by ``tabula.read_pdf``.
    """
    # Imported locally so this function does not rely on a module-level
    # pandas import.
    import pandas as pd

    # Read the requested file (previously hard-coded to 'test.pdf').
    tables = tb.read_pdf(pdf, pages='all', multiple_tables=True)
    # save in csv
    tb.convert_into(pdf, 'test.csv', output_format='csv', pages='all')
    # save in excel
    # tabula's convert_into documents only csv/json/tsv output, so the Excel
    # file is written from the extracted tables with pandas instead (pandas
    # needs an Excel engine such as openpyxl to be installed).
    if tables:
        with pd.ExcelWriter('test.xlsx') as writer:
            for number, table in enumerate(tables, start=1):
                table.to_excel(writer, sheet_name=f'Table {number}', index=False)
    return tables