def convert_pdf_to_txt(pdf_file_nm):
    """OCR each page of ``./pdf/<pdf_file_nm>`` and append the text to a file.

    Every page is rendered with ``convert_from_path``, saved as
    ``images/<pdf name>/pg_<n>_<pdf name>.jpg``, run through
    ``pytesseract.image_to_string`` and appended to
    ``./text/<pdf name>.txt`` beneath a ``PAGE <n>`` banner, followed by
    the ``devider`` value.

    The text file is opened in append mode, so calling this twice for the
    same PDF appends the pages a second time.

    Args:
        pdf_file_nm: File name of the PDF, relative to ``./pdf/``.

    Returns:
        The name (not the full path) of the text file inside ``./text/``.
    """
    # If tesseract is not on PATH, point pytesseract at the executable first:
    # pytesseract.pytesseract.tesseract_cmd = r'C:\Users\xxx\AppData\Local\Tesseract-OCR\tesseract.exe'
    pdf_path = './pdf/' + pdf_file_nm
    output_filename = pdf_file_nm.replace('.pdf', '') + ".txt"
    output_path = './text/' + output_filename

    pages = convert_from_path(pdf_path)

    # Loop-invariant pieces of the per-page image name, computed once.
    pdf_base_nm = pdf_path.split('/')[-1]
    sub_dir = "images/" + pdf_base_nm.replace('.pdf', '') + "/"
    image_suffix = pdf_base_nm.replace('.pdf', '.jpg')

    # exist_ok avoids the check-then-create race; the text directory must
    # exist as well, otherwise opening the output file fails on a fresh tree.
    os.makedirs(sub_dir, exist_ok=True)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Open the output once instead of once per page; append mode is kept so
    # existing content is preserved exactly as before.
    with io.open(output_path, 'a+', encoding='utf8') as f:
        for pg_cntr, page in enumerate(pages, start=1):
            print("ok")
            image_path = sub_dir + "pg_" + str(pg_cntr) + '_' + image_suffix
            page.save(image_path)
            f.write("======================================================== PAGE " + str(pg_cntr) + " ========================================================\n")
            f.write(pytesseract.image_to_string(image_path) + "\n")
            # `devider` is not defined in this function; it is expected to be
            # provided at module level.
            f.write(str(devider))

    print('1. Process to convert PDF to image completed successfully.\n')
    return output_filename