File size: 5,704 Bytes
e963fa4
 
 
 
 
 
 
 
 
5f2768f
e963fa4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8cec98
e963fa4
 
 
 
 
 
 
 
 
 
 
5f2768f
e963fa4
 
 
f3b313e
e963fa4
 
 
 
 
 
951cf22
fe84c5e
882ceed
f3b313e
 
5f2768f
fe84c5e
 
 
 
 
c95667e
e963fa4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6bcd19
e963fa4
 
 
 
 
5f2768f
4110b5a
5f2768f
 
 
 
 
 
 
e963fa4
 
9f75b50
e963fa4
c95667e
e963fa4
 
 
 
 
 
 
2c58ef3
 
 
 
 
 
 
eef8834
fe84c5e
e963fa4
 
9f75b50
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os 
import gradio as gr
import re
from langchain.embeddings.base import Embeddings
from typing import List
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain_community.llms.huggingface_hub import HuggingFaceHub
from read_photodocument import convert_PDF_to_Text,convert_image_to_pdf
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
import contextlib
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
import logging

# Root-logger setup: INFO and above, rendered as "<timestamp> <level> <message>".
LOG_FORMAT = "%(asctime)s %(levelname)s %(message)s"
# 24-hour clock (%H). A 12-hour %I without an AM/PM marker makes a morning and
# an afternoon record carry the same timestamp.
LOG_DATE_FORMAT = "%m/%d/%Y %H:%M:%S"

logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    datefmt=LOG_DATE_FORMAT,
)

# Device handed to the embedding model below.
DEVICE = 'cpu'
# Upload extensions the app is written for (not referenced by the visible code).
FILE_EXT = ['pdf','jpg','jpeg']
DEFAULT_SYSTEM_PROMPT = "As an intelligent AI your task is to extract text from the pdf containing image and create a summary and higlight vital point within it ."

# Generation limits and defaults.
MAX_NEW_TOKENS = 2048
DEFAULT_TEMPERATURE = 0.1
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = 2048

# Alternative embedding model: "sentence-transformers/all-MiniLM-l6-v2"
embedding_modelPath = 'multi-qa-mpnet-base-dot-v1'
# Use the DEVICE constant instead of a second hard-coded 'cpu' so the device is
# configured in exactly one place.
local_embeddings = HuggingFaceEmbeddings(
    model_name=embedding_modelPath,
    model_kwargs={'device': DEVICE},
    encode_kwargs={'normalize_embeddings': False},
)


# Build the OCR predictor once at import time; stdout is redirected while it is
# constructed so anything the library prints during setup is discarded.
with contextlib.redirect_stdout(None):
    ocr_model = ocr_predictor(
        "db_resnet50",
        "crnn_mobilenet_v3_large",
        pretrained=True,
        assume_straight_pages=True,
    )

def loading_file():
    """Return the fixed placeholder status string ``"Loading..."``."""
    status_text = "Loading..."
    return status_text


def summarize_data(docs,llm_model,chain_type='refine'):
    """Summarize ``docs`` with a LangChain summarize chain and return the text.

    Args:
        docs: Documents passed to the chain as ``input_documents``.
        llm_model: Language model handed to ``load_summarize_chain``.
        chain_type: Chain type forwarded to ``load_summarize_chain``.
            NOTE(review): the custom prompt is supplied as ``question_prompt``;
            other chain types may expect a different keyword, so confirm
            before passing anything but the default.

    Returns:
        The model output from the first ``CONCISE SUMMARY:`` marker onward, or
        the whole output when the marker is absent. Newlines are first
        flattened to spaces, then every double space becomes a newline.
        An empty string is returned when ``docs`` is empty.
    """
    # Nothing to summarize: skip the model call entirely.
    if not docs:
        return ""

    prompt_template = """
    Write a concise summary of the following text pointwise without repeating sentences:
    {text}
    CONCISE SUMMARY:
    """
    prompt = PromptTemplate.from_template(prompt_template)

    # No custom refine prompt is passed, so the refine step uses whatever
    # prompt the library provides for it.
    chain = load_summarize_chain(
        llm=llm_model,
        chain_type=chain_type,
        question_prompt=prompt,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
    summary = chain({"input_documents": docs}, return_only_outputs=True)
    output_text = summary["output_text"].replace('\n', ' ')

    # The output may echo the prompt; keep only the part starting at the
    # marker. If the marker is missing, fall back to the full text rather
    # than failing on a missing match.
    marker = "CONCISE SUMMARY:"
    _, found, tail = output_text.partition(marker)
    concise_summary = marker + tail if found else output_text

    return concise_summary.replace('  ', '\n')
    


def process_documents(texts,data_chunk=1000,chunk_overlap=10):
    """Split raw text on newlines and wrap each chunk in a ``Document``.

    Args:
        texts: The text to split.
        data_chunk: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        A list with one ``Document`` per chunk, in splitter order.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": data_chunk,
        "chunk_overlap": chunk_overlap,
        "length_function": len,  # chunk length is measured with len()
    }
    splitter = CharacterTextSplitter(**splitter_options)

    documents = []
    for chunk in splitter.split_text(texts):
        documents.append(Document(page_content=chunk))
    return documents

def get_hugging_face_model(model_id='tiiuae/falcon-7b-instruct',temperature=0.01,max_tokens=4096,API_key=None):
    """Create a ``HuggingFaceHub`` LLM client for the given repository.

    Args:
        model_id: Hub repository id, passed as ``repo_id``.
        temperature: Sampling temperature placed in ``model_kwargs``.
        max_tokens: Placed in ``model_kwargs`` as ``max_new_tokens``.
        API_key: Passed as ``huggingfacehub_api_token``.

    Returns:
        The constructed ``HuggingFaceHub`` instance.
    """
    generation_kwargs = {"temperature": temperature, "max_new_tokens": max_tokens}
    return HuggingFaceHub(
        huggingfacehub_api_token=API_key,
        repo_id=model_id,
        model_kwargs=generation_kwargs,
    )


def document_loader(temperature,max_tokens,api_key,model_name,file_path):
    """OCR an uploaded PDF/JPEG and return an LLM-generated summary.

    Args:
        temperature: Sampling temperature forwarded to the LLM factory.
        max_tokens: Maximum new tokens forwarded to the LLM factory.
        api_key: Hugging Face Hub token forwarded to the LLM factory.
        model_name: Hub repository id of the model to use.
        file_path: The upload: either a path string or an object exposing
            the path as ``.name`` (what the UI passes depends on the Gradio
            version). ``None`` is tolerated.

    Returns:
        The summary text on success, ``"Invalid Format ...."`` for a missing
        or unsupported file, or ``"Error in Processsing document "`` when OCR
        produced no text.
    """
    # Normalise the upload to a plain path string; anything else (including a
    # missing upload) is rejected instead of crashing on ``.endswith``.
    path = getattr(file_path, "name", file_path)
    if not isinstance(path, str):
        return "Invalid Format ...."

    # Case-insensitive extension check so ".PDF" / ".JPG" are accepted too.
    lowered = path.lower()
    is_pdf = lowered.endswith(".pdf")
    if not is_pdf and not lowered.endswith((".jpg", ".jpeg")):
        return "Invalid Format ...."

    # Build the model only after the file has been validated, but before OCR
    # so a bad model/token configuration still surfaces before the slow step.
    model = get_hugging_face_model(
        model_id=model_name,
        API_key=api_key,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    if is_pdf:
        conversion_stats = convert_PDF_to_Text(PDF_file=path, ocr_model=ocr_model)
    else:
        conversion_stats = convert_image_to_pdf(path, ocr_model)

    converted_txt = conversion_stats["converted_text"]
    logging.info(
        "Converted %s: %s page(s), %d characters",
        path,
        conversion_stats.get("num_pages"),
        len(converted_txt or ""),
    )
    logging.debug("Converted text: %s", converted_txt)
    # NOTE(review): "truncated" is assumed to mean the converter cut the text
    # short -- confirm against read_photodocument.
    if conversion_stats.get("truncated"):
        logging.warning("Converter flagged the text of %s as truncated", path)

    if not converted_txt:
        return "Error in Processsing document "

    logging.info("Document Processed ..")
    texts = process_documents(texts=converted_txt)
    return summarize_data(docs=texts, llm_model=model)



# Gradio UI wiring: the inputs are listed in the same order as the parameters
# of document_loader (temperature, max_tokens, api_key, model_name, file_path).
iface = gr.Interface(
    fn=document_loader,
    inputs=[
        gr.Slider(0.01, 0.1, value=0.01, step=0.01, label="temperature", info="Choose between 0.01 to 0.1"),
        # Bounded by the generation constants rather than the input-length one.
        gr.Slider(512, MAX_NEW_TOKENS, value=DEFAULT_MAX_NEW_TOKENS, step=512, label="max new tokens", info='Max new tokens'),
        gr.Textbox(label="Add API key", type="password"),
        # Explicit default so a model is always selected; without one the
        # callback can receive None, depending on the Gradio version.
        gr.Dropdown(
            ['tiiuae/falcon-7b-instruct', 'mistralai/Mistral-7B-v0.1'],
            value='tiiuae/falcon-7b-instruct',
            label='Large Language Model',
            info='LLM Service',
        ),
        "file",
    ],
    outputs="text",
    description="Summarize your PDF Document having Image  • HuggingFace",
)

# Launched unconditionally, so importing this module starts the app.
iface.launch()