File size: 4,031 Bytes
846b4a5
 
 
 
 
 
 
 
 
 
b07041b
 
846b4a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b07041b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import re
import time

import docx
import pytesseract
from nltk.tokenize import sent_tokenize, word_tokenize
from PyPDF2 import PdfReader
from pdf2image import convert_from_path


class OCRService:
    """Extract and normalize text from PDF, DOC(X) and TXT documents.

    Supports three extraction paths: the embedded PDF text layer
    (PyPDF2), local OCR (pdf2image + pytesseract), and the LlamaParse
    cloud API. Also provides text cleanup and sentence-aware chunking.
    """

    def __init__(self, LLAMAPARSE_API_KEY):
        # API key used by llama_parse_ocr for the LlamaParse cloud service.
        self.llama_parse_key = LLAMAPARSE_API_KEY

    def extract_ocrless_pdf(self, filepath):
        """Return the embedded (non-OCR) text layer of a PDF as one string.

        Each page's text is preceded by a single space, matching the
        original concatenation order. Pages without a text layer (where
        ``extract_text()`` returns None) contribute an empty string
        instead of raising TypeError.
        """
        reader = PdfReader(filepath)
        # `or ""` guards against None from image-only pages; join avoids
        # quadratic string concatenation.
        return "".join(" " + (page.extract_text() or "") for page in reader.pages)

    def extract_text_from_pdf(self, filepath):
        """OCR every page of a PDF and return a list of per-page strings."""
        images = convert_from_path(filepath, thread_count=4)
        # Tesseract defaults; a custom config (e.g. --oem/--psm) could be
        # passed via pytesseract's `config=` argument if needed.
        return [pytesseract.image_to_string(image) for image in images]

    def extract_text_from_document(self, filepath):
        """Dispatch on file extension and return the document's full text.

        Handles .pdf (OCR path), .doc/.docx (python-docx), and .txt.
        Extension matching is case-insensitive (the original handled
        .DOC/.DOCX but not .PDF/.TXT).

        Raises:
            ValueError: for unsupported extensions (previously this
                surfaced as an opaque UnboundLocalError).
        """
        file_ext = os.path.splitext(filepath)[-1].lower()
        if file_ext == ".pdf":
            text_joined = " ".join(self.extract_text_from_pdf(filepath))
        elif file_ext in (".doc", ".docx"):
            doc_content = docx.Document(filepath)
            text_joined = " \n ".join(p.text for p in doc_content.paragraphs)
        elif file_ext == ".txt":
            # Open filepath directly (the old code rebuilt the same path
            # from splitext) and close the handle deterministically.
            with open(filepath, encoding="utf8") as file:
                text_joined = file.read()
        else:
            raise ValueError(f"Unsupported file type: {file_ext}")
        return text_joined

    def preprocess_document(self, document):
        """Normalize newline runs and quote characters in extracted text."""
        # str.replace treats r'\n+' as a literal 3-character string, so the
        # original call was a no-op; use a real regex to collapse newlines.
        document = re.sub(r"\n+", "\n", document)
        # Normalize curly quotes and backslash-escaped quotes to plain ".
        document = document.replace("“", "\"").replace("”", "\"")
        document = re.sub(r"\\\"", "\"", document)
        return document

    def chunk_document(self, text, k=1500):
        """Split text into chunks of at most k words on sentence boundaries.

        A single sentence longer than k words becomes its own chunk.
        Chunks with fewer than 2 whitespace-separated tokens are dropped.

        Returns:
            list[str]: the non-trivial chunks, in document order.
        """
        chunks = []
        current_chunk = []
        current_word_count = 0

        for sentence in sent_tokenize(text):
            n_words = len(word_tokenize(sentence))
            if current_word_count + n_words <= k:
                current_chunk.append(sentence)
                current_word_count += n_words
            else:
                chunks.append(" ".join(current_chunk))
                current_chunk = [sentence]
                current_word_count = n_words

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        # Filter instead of `del chunks[id]` inside the enumerate loop:
        # deleting while iterating skipped the element after each deletion.
        return [c for c in chunks if len(c.split()) >= 2]

    def llama_parse_ocr(self, file_path):
        """Upload a PDF to LlamaParse and poll until the result is ready.

        Returns:
            str: the parsed document as markdown.
        """
        # Local import: `requests` was never imported at module level, so
        # the original method raised NameError on first use.
        import requests

        llamaparse_url = 'https://api.cloud.llamaindex.ai/api/parsing/upload'
        headers = {
            'accept': 'application/json',
            'Authorization': f'Bearer {self.llama_parse_key}'
        }
        # Close the upload handle deterministically (it used to leak).
        with open(file_path, 'rb') as fh:
            files = {
                'file': (file_path, fh, 'application/pdf')
            }
            response = requests.post(llamaparse_url, headers=headers, files=files)

        job_id = response.json()["id"]
        result_type = "markdown"

        llamaparse_result_url = f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/{result_type}"

        # Poll until the job finishes; sleep between attempts instead of
        # hammering the API in a tight loop.
        while True:
            response = requests.get(llamaparse_result_url, headers=headers)
            if response.status_code == 200:
                break
            time.sleep(1)

        return response.json()['markdown']