Breaking News: Grepper is joining You.com. Read the official announcement!

indentation for tesseract

Pham Thanh answered on February 25, 2023 Popularity 1/10 Helpfulness 1/10

answer indentation for tesseract

indentation for tesseract

Comment

Tip Pham Thanh 1 GREPCC

import pytesseract
from pytesseract import Output
from PIL import Image
import pandas as pd

# Reconstruct the visual layout (line breaks + horizontal indentation) of a
# scanned page from Tesseract's word-level output and print it block by block.

# Tesseract options: keep inter-word spaces, LSTM engine (--oem 1), automatic
# page segmentation with OSD (--psm 1), English + Italian language models.
custom_config = r'-c preserve_interword_spaces=1 --oem 1 --psm 1 -l eng+ita'

# Close the image file handle as soon as OCR is done.
with Image.open(r'referto-2.jpg') as img:
    d = pytesseract.image_to_data(img, config=custom_config, output_type=Output.DICT)
df = pd.DataFrame(d)

# clean up blanks
# `conf` is the string '-1' in some pytesseract versions and a number in
# others; normalise to numeric so structural (non-word) rows are always dropped.
conf = pd.to_numeric(df['conf'], errors='coerce')
# Drop every whitespace-only token, not just '' and ' '.
has_text = df['text'].str.strip() != ''
df1 = df[(conf != -1) & has_text]

# sort blocks vertically (by the `top` of the first word of each block)
sorted_blocks = df1.groupby('block_num').first().sort_values('top').index.tolist()
for block in sorted_blocks:
    curr = df1[df1['block_num'] == block]

    # Estimate the average character width (in pixels) from words longer than
    # 3 characters; if the block has none, fall back to all of its words so
    # char_w is not NaN (which would silently disable indentation).
    sel = curr[curr['text'].str.len() > 3]
    if sel.empty:
        sel = curr
    char_w = (sel['width'] / sel['text'].str.len()).mean()

    # prev_left counts the characters already emitted on the current line.
    prev_par, prev_line, prev_left = 0, 0, 0
    parts = []
    for _, ln in curr.iterrows():
        # add new line when the paragraph or the line number changes
        if prev_par != ln['par_num'] or prev_line != ln['line_num']:
            parts.append('\n')
            prev_par = ln['par_num']
            prev_line = ln['line_num']
            prev_left = 0

        added = 0  # num of spaces that should be added
        # Word's horizontal position expressed in character columns.
        col = ln['left'] / char_w
        if col > prev_left + 1:
            added = int(col) - prev_left
            parts.append(' ' * added)
        parts.append(ln['text'] + ' ')
        prev_left += len(ln['text']) + added + 1
    parts.append('\n')
    text = ''.join(parts)
    print(text)

xxxxxxxxxx

import pytesseract

from pytesseract import Output

from PIL import Image

import pandas as pd

# Reconstruct the visual layout (line breaks + horizontal indentation) of a
# scanned page from Tesseract's word-level output and print it block by block.

# Tesseract options: keep inter-word spaces, LSTM engine (--oem 1), automatic
# page segmentation with OSD (--psm 1), English + Italian language models.
custom_config = r'-c preserve_interword_spaces=1 --oem 1 --psm 1 -l eng+ita'

# Close the image file handle as soon as OCR is done.
with Image.open(r'referto-2.jpg') as img:
    d = pytesseract.image_to_data(img, config=custom_config, output_type=Output.DICT)
df = pd.DataFrame(d)

# clean up blanks
# `conf` is the string '-1' in some pytesseract versions and a number in
# others; normalise to numeric so structural (non-word) rows are always dropped.
conf = pd.to_numeric(df['conf'], errors='coerce')
# Drop every whitespace-only token, not just '' and ' '.
has_text = df['text'].str.strip() != ''
df1 = df[(conf != -1) & has_text]

# sort blocks vertically (by the `top` of the first word of each block)
sorted_blocks = df1.groupby('block_num').first().sort_values('top').index.tolist()
for block in sorted_blocks:
    curr = df1[df1['block_num'] == block]

    # Estimate the average character width (in pixels) from words longer than
    # 3 characters; if the block has none, fall back to all of its words so
    # char_w is not NaN (which would silently disable indentation).
    sel = curr[curr['text'].str.len() > 3]
    if sel.empty:
        sel = curr
    char_w = (sel['width'] / sel['text'].str.len()).mean()

    # prev_left counts the characters already emitted on the current line.
    prev_par, prev_line, prev_left = 0, 0, 0
    parts = []
    for _, ln in curr.iterrows():
        # add new line when the paragraph or the line number changes
        if prev_par != ln['par_num'] or prev_line != ln['line_num']:
            parts.append('\n')
            prev_par = ln['par_num']
            prev_line = ln['line_num']
            prev_left = 0

        added = 0  # num of spaces that should be added
        # Word's horizontal position expressed in character columns.
        col = ln['left'] / char_w
        if col > prev_left + 1:
            added = int(col) - prev_left
            parts.append(' ' * added)
        parts.append(ln['text'] + ' ')
        prev_left += len(ln['text']) + added + 1
    parts.append('\n')
    text = ''.join(parts)
    print(text)

Popularity 1/10 Helpfulness 1/10 Language python

Source: stackoverflow.com

Tags: indentation python tesseract

Link to this answer
Share Copy Link

Contributed on Feb 25 2023

Pham Thanh

0 Answers Avg Quality 2/10

indentation for tesseract

Contents

More Related Answers

indentation for tesseract

Grepper

Documentation

Social

Legal

Contact

Oops, you will need to install Grepper and log in to perform this action.