Breaking News: Grepper is joining You.com. Read the official announcement!

pdf to text

Add Answer

Ugliest Unicorn answered on October 31, 2023 Popularity 10/10 Helpfulness 1/10

answer pdf to text

pdf to text

Comments(1)

-2

Tip Ugliest Unicorn 1 GREPCC

def convert_pdf_to_txt(pdf_file_nm):
    """OCR every page of ``./pdf/<pdf_file_nm>`` into a text file under ``./text/``.

    Each page object returned by ``convert_from_path`` (presumably
    pdf2image's -- confirm against the file's imports) is saved as an image
    under ``images/<name>/``, and that image file is handed to
    ``pytesseract.image_to_string``.  The recognised text is appended to the
    output file beneath a ``PAGE n`` banner and followed by a page divider.

    Args:
        pdf_file_nm: Name of the PDF, relative to the ``./pdf/`` directory.

    Returns:
        The output file name (not the full path): ``pdf_file_nm`` with every
        occurrence of ``.pdf`` removed and ``.txt`` appended.

    Note:
        The output file is opened in append mode, so converting the same PDF
        twice adds a second copy of the text to the existing file.
    """
    # If you need to assign tesseract to path
    # pytesseract.pytesseract.tesseract_cmd = r'C:\Users\xxx\AppData\Local\Tesseract-OCR\tesseract.exe'

    pdf_path = './pdf/' + pdf_file_nm
    output_filename = pdf_file_nm.replace('.pdf', '') + ".txt"
    output_path = './text/' + output_filename

    # Computed once; the original re-derived this for every page.
    base_name = pdf_path.split('/')[-1]
    sub_dir = "images/" + base_name.replace('.pdf', '') + "/"

    # The original wrote a module-level ``devider`` that is not defined in the
    # visible code, which raised NameError on the first page.  Keep using it
    # when the surrounding module does define it; otherwise fall back to a
    # blank line between pages.
    try:
        page_divider = devider
    except NameError:
        page_divider = "\n"

    pages = convert_from_path(pdf_path)

    # Make sure both the image directory and the text output directory exist;
    # the original only created the former, so the first write failed when
    # ./text/ was missing.
    os.makedirs(sub_dir, exist_ok=True)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Open the output once instead of reopening it for every page.
    with io.open(output_path, 'a', encoding='utf8') as f:
        for pg_cntr, page in enumerate(pages, start=1):
            print("ok")
            image_path = sub_dir + "pg_" + str(pg_cntr) + '_' + base_name.replace('.pdf', '.jpg')
            page.save(image_path)

            f.write("======================================================== PAGE " + str(pg_cntr) + " ========================================================\n")
            f.write(str(pytesseract.image_to_string(image_path) + "\n"))
            f.write(str(page_divider))

    print('1. Process to convert PDF to image completed successfully.\n')

    return output_filename

xxxxxxxxxx

def convert_pdf_to_txt(pdf_file_nm):
    """OCR every page of ``./pdf/<pdf_file_nm>`` into a text file under ``./text/``.

    Each page object returned by ``convert_from_path`` (presumably
    pdf2image's -- confirm against the file's imports) is saved as an image
    under ``images/<name>/``, and that image file is handed to
    ``pytesseract.image_to_string``.  The recognised text is appended to the
    output file beneath a ``PAGE n`` banner and followed by a page divider.

    Args:
        pdf_file_nm: Name of the PDF, relative to the ``./pdf/`` directory.

    Returns:
        The output file name (not the full path): ``pdf_file_nm`` with every
        occurrence of ``.pdf`` removed and ``.txt`` appended.

    Note:
        The output file is opened in append mode, so converting the same PDF
        twice adds a second copy of the text to the existing file.
    """
    # If you need to assign tesseract to path
    # pytesseract.pytesseract.tesseract_cmd = r'C:\Users\xxx\AppData\Local\Tesseract-OCR\tesseract.exe'

    pdf_path = './pdf/' + pdf_file_nm
    output_filename = pdf_file_nm.replace('.pdf', '') + ".txt"
    output_path = './text/' + output_filename

    # Computed once; the original re-derived this for every page.
    base_name = pdf_path.split('/')[-1]
    sub_dir = "images/" + base_name.replace('.pdf', '') + "/"

    # The original wrote a module-level ``devider`` that is not defined in the
    # visible code, which raised NameError on the first page.  Keep using it
    # when the surrounding module does define it; otherwise fall back to a
    # blank line between pages.
    try:
        page_divider = devider
    except NameError:
        page_divider = "\n"

    pages = convert_from_path(pdf_path)

    # Make sure both the image directory and the text output directory exist;
    # the original only created the former, so the first write failed when
    # ./text/ was missing.
    os.makedirs(sub_dir, exist_ok=True)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Open the output once instead of reopening it for every page.
    with io.open(output_path, 'a', encoding='utf8') as f:
        for pg_cntr, page in enumerate(pages, start=1):
            print("ok")
            image_path = sub_dir + "pg_" + str(pg_cntr) + '_' + base_name.replace('.pdf', '.jpg')
            page.save(image_path)

            f.write("======================================================== PAGE " + str(pg_cntr) + " ========================================================\n")
            f.write(str(pytesseract.image_to_string(image_path) + "\n"))
            f.write(str(page_divider))

    print('1. Process to convert PDF to image completed successfully.\n')

    return output_filename

Popularity 10/10 Helpfulness 1/10 Language python

Source: stackoverflow.com

Tags: pdf python text

Link to this answer
Share Copy Link

Contributed on Oct 31 2023

Ugliest Unicorn

0 Answers Avg Quality 2/10

pdf to text

Contents

More Related Answers

pdf to text

Grepper

Documentation

Social

Legal

Contact

Oops, you will need to install Grepper and log in to perform this action.