# --- Snippet 1: OCR every page of every PDF in a folder into per-page .txt files ---
import pytesseract
from pdf2image import convert_from_path
import glob
# Batch OCR: for every PDF in the folder, rasterise each page and write the
# recognised text to "<pdf name without .pdf>_page<N>.txt" next to the PDF.
# Page numbers in the file names are 0-based (they come from enumerate()).
pdfs = glob.glob(r"yourPath\*.pdf")

for pdf_path in pdfs:
    # Render each page to a PIL image at 500 DPI. All pages of the PDF are
    # held in memory at once, so very large documents may need a lower DPI.
    pages = convert_from_path(pdf_path, dpi=500)

    for page_num, page_image in enumerate(pages):
        text = pytesseract.image_to_string(page_image, lang='eng')

        # The glob pattern only matches names ending in ".pdf", so dropping
        # the last four characters removes exactly the extension.
        # Write as UTF-8 explicitly: OCR output can contain characters that
        # the platform default encoding (e.g. cp1252 on Windows) cannot
        # represent, which would raise UnicodeEncodeError.
        with open(f'{pdf_path[:-4]}_page{page_num}.txt', 'w', encoding='utf-8') as out_file:
            out_file.write(text)
# --- Snippet 2: OCR a single file and print the recognised text ---
import cv2
import pytesseract
# OCR a single PDF and print the recognised text of all its pages.
#
# cv2.imread() only decodes raster image formats (PNG, JPEG, TIFF, ...); for a
# PDF it returns None, and pytesseract then fails on the None input. The PDF
# has to be rasterised first, which pdf2image's convert_from_path() does
# (it returns one PIL image per page).
from pdf2image import convert_from_path  # repeated here so this snippet runs on its own

pdf_file = '/Users/user1/Desktop/folder1/pdf1.pdf'
pages = convert_from_path(pdf_file, dpi=500)

# pytesseract accepts PIL images directly; join the per-page results so the
# whole document is printed as one text.
text = '\n'.join(pytesseract.image_to_string(page) for page in pages)
print(text)