Spaces:
Sleeping
Sleeping
| import os, time | |
| from paddleocr import PaddleOCR | |
| from pdf_img_convert import PDFtoImage | |
class OCRProcessor:
    """Run PaddleOCR over the images produced from an input file.

    The file is first handed to ``PDFtoImage.pdf_to_img_conversion`` and every
    image it returns is passed through the PaddleOCR engine.
    """

    def __init__(self):
        """Create the image converter and the OCR engine (English, angle classifier on)."""
        self.pdf_img_convert = PDFtoImage()
        self.ocr = PaddleOCR(use_angle_cls=True, lang='en')

    def perform_ocr(self, file_path, output_folder):
        """Return the recognised text of ``file_path`` as a single string.

        Args:
            file_path: Path of the document to convert and OCR.
            output_folder: Directory handed to the converter; created if it
                does not exist yet.

        Returns:
            The recognised text. Every text fragment is followed by a space,
            every OCR result entry by a newline, and the whole string is
            stripped of leading/trailing whitespace. ``None`` is returned when
            the converter yields no images.
        """
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_folder, exist_ok=True)

        images = self.pdf_img_convert.pdf_to_img_conversion(file_path, output_folder)
        if not images:
            # Preserve the historical contract: nothing to OCR -> None.
            return None

        chunks = []
        for image in images:
            # The engine may hand back None instead of a list; treat as empty.
            result = self.ocr.ocr(image, cls=True) or []
            for res in result:
                # PaddleOCR reports an image without detected text as None;
                # iterating it directly would raise TypeError.
                # Each detected line is expected to carry its text at
                # line[1][0] (presumably [box, (text, confidence)] -- see the
                # PaddleOCR result format).
                fragments = [f'{line[1][0]} ' for line in (res or [])]
                chunks.append("".join(fragments))
                chunks.append('\n')

        return "".join(chunks).strip()