SmartDocumentClassifier / text_extractor.py
HimankJ's picture
Update text_extractor.py
e45be25 verified
raw
history blame contribute delete
917 Bytes
import os, time
from paddleocr import PaddleOCR
from pdf_img_convert import PDFtoImage
class OCRProcessor:
    """Extract text from a document by converting it to images and running OCR.

    Holds a ``PDFtoImage`` converter and a ``PaddleOCR`` engine configured for
    English with angle classification enabled.
    """

    def __init__(self):
        """Create the PDF-to-image converter and the OCR engine."""
        self.pdf_img_convert = PDFtoImage()
        self.ocr = PaddleOCR(use_angle_cls=True, lang='en')

    def perform_ocr(self, file_path, output_folder):
        """Run OCR over every image produced from ``file_path``.

        Args:
            file_path: Path of the document handed to the PDF-to-image
                converter.
            output_folder: Directory passed to the converter; it is created
                if it does not exist yet.

        Returns:
            The recognised text as a single string: each text fragment is
            followed by a space, each image's text is terminated by a newline,
            and the overall result is stripped of leading/trailing whitespace.
            Returns ``None`` when the converter yields no images (kept for
            backward compatibility with existing callers).
        """
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_folder, exist_ok=True)

        images = self.pdf_img_convert.pdf_to_img_conversion(file_path, output_folder)
        if not images:
            return None

        pieces = []
        for image in images:
            result = self.ocr.ocr(image, cls=True)
            # PaddleOCR can return a None entry (or nothing at all) for an
            # image in which no text was detected; skip those instead of
            # crashing on iteration.
            for page_lines in result or ():
                if not page_lines:
                    continue
                # Each detected line is indexed as line[1][0] to get its text
                # (presumably (box, (text, confidence)) -- confirm against the
                # installed PaddleOCR version).
                pieces.extend(f'{line[1][0]} ' for line in page_lines)
            pieces.append('\n')
        return ''.join(pieces).strip()