xxxxxxxxxx
# Call the API to convert the PDF to Excel (XLSX) format
response = pdf_api.put_pdf_in_storage_to_xlsx(
    name=input_file,
    out_path=resultant_file_name,
    uniform_worksheets='true',
)
# For complete details, please see the blog post: https://blog.aspose.cloud/2021/11/27/convert-excel-files-to-pdf-in-python/
xxxxxxxxxx
# Import the required Module
import tabula
# Read the tables from every page of the PDF and keep the first one
tables = tabula.read_pdf("IPLmatch.pdf", pages='all')
df = tables[0]
# Convert all pages of the same PDF into a CSV file
tabula.convert_into(
    "IPLmatch.pdf",
    "iplmatch.csv",
    output_format="csv",
    pages='all',
)
print(df)
xxxxxxxxxx
import PyPDF2
import pandas as pd
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF file.

    Args:
        pdf_file: Path of the PDF file to open (opened in binary mode).

    Returns:
        A single string holding the text of all pages in page order, with
        no separator inserted between pages.
    """
    with open(pdf_file, 'rb') as file:
        # PdfFileReader, numPages and getPage were removed in PyPDF2 3.0;
        # PdfReader and its `pages` sequence are the supported replacements
        # and belong to the same API generation as `extract_text()`.
        pdf_reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open. A page that
        # yields no text is treated as an empty string so the join cannot
        # fail on a non-string value.
        return "".join(
            page.extract_text() or "" for page in pdf_reader.pages
        )
def save_to_excel(text, excel_file):
    """Write tab-separated text to an Excel file, one row per line.

    Args:
        text: Text whose lines are separated by '\\n' and whose fields are
            assumed to be separated by tab characters.
        excel_file: Destination passed to ``DataFrame.to_excel``.
    """
    # Break the text into lines, then each line into its tab-separated cells.
    rows = []
    for row_text in text.split('\n'):
        rows.append(row_text.split('\t'))

    # Write the cells as-is: no index column and no header row.
    table = pd.DataFrame(rows)
    table.to_excel(excel_file, index=False, header=False)
if __name__ == "__main__":
    # Placeholder paths: replace them with the real input and output files.
    pdf_file_path = "path/to/your/pdf_file.pdf"
    excel_file_path = "path/to/your/excel_file.xlsx"

    # Extract the PDF text and write it straight to the Excel file.
    save_to_excel(extract_text_from_pdf(pdf_file_path), excel_file_path)
    print("PDF data extracted and saved to Excel successfully.")
xxxxxxxxxx
# 1. Download and install Java
# 2. Install the Python library 'tabula-py' using pip
pip install tabula-py
# If this is the first time you are installing Java and tabula-py,
# add your Java installation folder to the PATH variable.
# If you don't, this is the error message you'll get:
tabula.errors.JavaNotFoundError: `java` command is not found from this
Python process.Please ensure Java is installed and PATH is set for `java`
# 3. Import tabula and run its read_pdf function on the desired page of the PDF file.
import tabula
# Read page 3 with lattice mode enabled and keep the second table returned
tables = tabula.read_pdf('data.pdf', pages=3, lattice=True)
df = tables[1]
xxxxxxxxxx
pip install git+https://github.com/pdftables/python-pdftables-api.git