Samarth991's picture
Update app.py
4110b5a verified
raw
history blame
5.7 kB
import os
import gradio as gr
import re
from langchain.embeddings.base import Embeddings
from typing import List
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain_community.llms.huggingface_hub import HuggingFaceHub
from read_photodocument import convert_PDF_to_Text,convert_image_to_pdf
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
import contextlib
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
import logging
# Root logger: INFO level, timestamped messages.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    # %I is a 12-hour clock, so %p (AM/PM) is needed to keep timestamps
    # unambiguous.
    datefmt="%m/%d/%Y %I:%M:%S %p",
)

# Device handed to the embedding model below.
DEVICE = 'cpu'
# Extensions matching the suffix checks in document_loader.
# NOTE(review): not referenced anywhere in the visible code.
FILE_EXT = ['pdf', 'jpg', 'jpeg']
# NOTE(review): the prompt and the three token/temperature defaults below are
# not referenced in the visible code -- confirm whether they are still needed.
DEFAULT_SYSTEM_PROMPT = "As an intelligent AI your task is to extract text from the pdf containing image and create a summary and higlight vital point within it ."
MAX_NEW_TOKENS = 2048
DEFAULT_TEMPERATURE = 0.1
DEFAULT_MAX_NEW_TOKENS = 1024
# Used as the upper bound of the "max new tokens" slider in the UI.
MAX_INPUT_TOKEN_LENGTH = 2048

# Alternative model noted by the author: "sentence-transformers/all-MiniLM-l6-v2"
embedding_modelPath = 'multi-qa-mpnet-base-dot-v1'
# NOTE(review): local_embeddings is built at import time but not used by any
# function visible in this file.
local_embeddings = HuggingFaceEmbeddings(
    model_name=embedding_modelPath,
    model_kwargs={'device': DEVICE},
    encode_kwargs={'normalize_embeddings': False},
)

# Build the OCR predictor once at import time; stdout is redirected to None
# so anything printed during construction is discarded.
with contextlib.redirect_stdout(None):
    ocr_model = ocr_predictor(
        "db_resnet50",
        "crnn_mobilenet_v3_large",
        pretrained=True,
        assume_straight_pages=True,
    )
def loading_file() -> str:
    """Return the fixed status string ``"Loading..."``."""
    status_message = "Loading..."
    return status_message
def summarize_data(docs, llm_model, chain_type='refine'):
    """Summarise ``docs`` with a LangChain summarize chain.

    Args:
        docs: Documents passed to the chain as ``input_documents``.
        llm_model: LLM handed to ``load_summarize_chain``.
        chain_type: Chain type forwarded to ``load_summarize_chain``.
            NOTE(review): the prompt is supplied via ``question_prompt``;
            chain types other than 'refine' presumably expect a different
            keyword -- confirm before passing another value.

    Returns:
        The model output from the "CONCISE SUMMARY:" marker onwards, or the
        whole output when the marker is absent. Newlines are flattened to
        spaces first, then every space is turned into a newline.
    """
    marker = "CONCISE SUMMARY:"
    prompt_template = (
        "\n"
        "Write a concise summary of the following text pointwise without repeating sentences:\n"
        "{text}\n"
        "CONCISE SUMMARY:\n"
    )
    prompt = PromptTemplate.from_template(prompt_template)
    # A custom refine prompt used to be drafted here but was never passed to
    # the chain. It is dropped because the draft had no {text} placeholder,
    # so it could not have shown the chain each new chunk.
    chain = load_summarize_chain(
        llm=llm_model,
        chain_type=chain_type,
        question_prompt=prompt,
        return_intermediate_steps=False,
        input_key="input_documents",
        output_key="output_text",
    )
    summary = chain({"input_documents": docs}, return_only_outputs=True)
    output_text = summary["output_text"].replace('\n', ' ')

    # Keep everything from the first marker to the end. If the model did not
    # echo the marker, fall back to the full output instead of crashing on a
    # failed match.
    marker_at = output_text.find(marker)
    concise_summary = output_text if marker_at == -1 else output_text[marker_at:]

    # NOTE(review): replacing every single space with a newline puts each word
    # on its own line. This mirrors the existing behaviour but looks
    # unintended -- confirm the intended separator.
    return concise_summary.replace(' ', '\n')
def process_documents(texts, data_chunk=1000, chunk_overlap=10):
    """Split raw text into a list of ``Document`` chunks.

    Args:
        texts: The text to split; it is split on newline characters.
        data_chunk: ``chunk_size`` given to the splitter.
        chunk_overlap: ``chunk_overlap`` given to the splitter.

    Returns:
        One ``Document`` per chunk produced by the splitter, in order.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": data_chunk,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
    }
    splitter = CharacterTextSplitter(**splitter_options)
    return [Document(page_content=piece) for piece in splitter.split_text(texts)]
def get_hugging_face_model(model_id='tiiuae/falcon-7b-instruct', temperature=0.01, max_tokens=4096, API_key=None):
    """Build a ``HuggingFaceHub`` LLM for the given repository.

    Args:
        model_id: Passed to ``HuggingFaceHub`` as ``repo_id``.
        temperature: Sent in ``model_kwargs`` as ``temperature``.
        max_tokens: Sent in ``model_kwargs`` as ``max_new_tokens``.
        API_key: Passed to ``HuggingFaceHub`` as ``huggingfacehub_api_token``.

    Returns:
        The constructed ``HuggingFaceHub`` instance.
    """
    generation_kwargs = {"temperature": temperature, "max_new_tokens": max_tokens}
    return HuggingFaceHub(
        repo_id=model_id,
        huggingfacehub_api_token=API_key,
        model_kwargs=generation_kwargs,
    )
def document_loader(temperature, max_tokens, api_key, model_name, file_path):
    """OCR an uploaded PDF/JPEG and return an LLM-generated summary.

    Args:
        temperature: Sampling temperature forwarded to the LLM.
        max_tokens: Maximum number of new tokens forwarded to the LLM.
        api_key: Hugging Face API token forwarded to the LLM client.
        model_name: Hub repository id of the LLM.
        file_path: The uploaded file. Either a path string or an object that
            exposes the path as ``.name``.
            NOTE(review): which of the two the Gradio "file" input delivers
            appears to depend on the Gradio version -- confirm.

    Returns:
        The summary text, or a short message when no file was given, the
        extension is unsupported, or no text could be extracted.
    """
    if file_path is None:
        return "No file uploaded."

    # Accept both plain paths and upload wrappers carrying the path in .name.
    path = str(getattr(file_path, "name", file_path))
    # Compare case-insensitively so ".PDF" / ".JPG" uploads are accepted too.
    lowered = path.lower()
    is_pdf = lowered.endswith('.pdf')
    is_image = lowered.endswith(('.jpg', '.jpeg'))
    if not (is_pdf or is_image):
        # Reject before building the LLM client or running OCR.
        return "Invalid Format ...."

    # Build the client before the (slow) OCR step so that a bad key or model
    # name fails early.
    model = get_hugging_face_model(
        model_id=model_name,
        API_key=api_key,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    if is_pdf:
        conversion_stats = convert_PDF_to_Text(PDF_file=path, ocr_model=ocr_model)
    else:
        conversion_stats = convert_image_to_pdf(path, ocr_model)

    converted_txt = conversion_stats["converted_text"]
    # Log sizes rather than the full document text to keep the log readable.
    logging.info(
        "Converted %d characters from %s page(s)",
        len(converted_txt or ""),
        conversion_stats.get("num_pages"),
    )

    if not converted_txt:
        return "Error in Processsing document "

    logging.info("Document Processed ..")
    chunks = process_documents(texts=converted_txt)
    return summarize_data(docs=chunks, llm_model=model)
# Gradio front end. The five inputs map positionally onto
# document_loader(temperature, max_tokens, api_key, model_name, file_path).
iface = gr.Interface(
    fn=document_loader,
    inputs=[
        gr.Slider(0.01, 0.1, value=0.01, step=0.01, label="temperature", info="Choose between 0.01 to 0.1"),
        gr.Slider(512, MAX_INPUT_TOKEN_LENGTH, value=1024, step=512, label="max new tokens", info='Max new tokens'),
        gr.Textbox(label="Add API key", type="password"),
        gr.Dropdown(
            ['tiiuae/falcon-7b-instruct', 'mistralai/Mistral-7B-v0.1'],
            # Pre-select a model so model_name is not None when the user
            # leaves the dropdown untouched.
            value='tiiuae/falcon-7b-instruct',
            label='Large Language Model',
            info='LLM Service',
        ),
        "file",
    ],
    outputs="text",
    description="Summarize your PDF Document having Image • HuggingFace",
)
iface.launch()