# --- Snippet: extract PDF text with pdfplumber ---
# pip3 install pdfplumber
import pdfplumber
# a single page
# Open test.pdf and print the text of its first page. The `with` block
# hands the opened PDF back to pdfplumber for cleanup when the body finishes.
with pdfplumber.open(r'test.pdf') as pdf:
    # Index 0 is the first page; the previous `-0` evaluated to the same
    # index but read as if it were counting from the end.
    first_page = pdf.pages[0]
    print(first_page.extract_text())
# for every page
# with pdfplumber.open(r'test.pdf') as pdf:
#     for page in pdf.pages:
#         print(page.extract_text())
# --- Snippet: extract PDF text with PyMuPDF ---
# using PyMuPDF
import sys, fitz
# Dump the plain text of every page of the document named on the command
# line into "<filename>.txt", writing a form feed after each page.
fname = sys.argv[1]  # get document filename
# Context managers close both the document and the output file even if
# text extraction raises part-way through the loop.
with fitz.open(fname) as doc, open(fname + ".txt", "wb") as out:
    for page in doc:  # iterate the document pages
        text = page.get_text().encode("utf8")  # page text encoded as UTF-8 bytes
        out.write(text)  # write text of page
        out.write(b"\x0c")  # write page delimiter (form feed 0x0C)
# --- Snippet: extract text from .docx, PDF, and other file formats ---
import docx2txt # for extracting text from .docx files
import PyPDF2 # for extracting text from PDF files
import textract # for extracting text from other file formats
def extract_text_from_file(file_path):
    """Return the text extracted from the file at *file_path*.

    The extractor is chosen from the file extension, compared
    case-insensitively: ``docx`` is handled by ``docx2txt``, ``pdf`` by
    ``PyPDF2``, and every other extension (or none) is passed to
    ``textract``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text.
    """
    import os  # local import: keeps this function independent of file-level imports

    # splitext inspects only the final path component, so an extensionless
    # file that happens to be named "pdf" or "docx" is no longer mistaken
    # for that type. [1:] drops the leading dot.
    file_extension = os.path.splitext(file_path)[1][1:].lower()

    if file_extension == 'docx':
        text = docx2txt.process(file_path)
    elif file_extension == 'pdf':
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Join once instead of repeated `+=`. The `or ''` is defensive:
            # a page that yields no text contributes an empty string.
            # The join must run while the file is still open.
            text = ''.join(
                page.extract_text() or '' for page in pdf_reader.pages
            )
    else:
        # textract.process returns bytes here; decode them as UTF-8.
        text = textract.process(file_path).decode('utf-8')
    return text
# Example usage: these statements run whenever this file is executed.
# The path below is a placeholder and must be replaced with a real file;
# the extracted text is kept in `extracted_text` and printed to stdout.
file_path = 'path/to/your/file.docx' # Replace with your file path
extracted_text = extract_text_from_file(file_path)
print(extracted_text)