Samarth991 committed on
Commit e963fa4
1 Parent(s): 136eadc

adding app

Files changed (3)
  1. app.py +192 -0
  2. read_photodocument.py +381 -0
  3. requirements.txt +14 -0
app.py ADDED
@@ -0,0 +1,192 @@
+ import os
+ import gradio as gr
+ import re
+ from langchain.vectorstores import FAISS
+ from langchain.embeddings.base import Embeddings
+ from typing import List
+ from sentence_transformers import SentenceTransformer
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain.prompts import PromptTemplate
+ from langchain_community.llms.huggingface_hub import HuggingFaceHub
+ from read_photodocument import convert_PDF_to_Text
+ from doctr.io import DocumentFile
+ from doctr.models import ocr_predictor
+ import contextlib
+ from langchain.schema import Document
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.chains.summarize import load_summarize_chain
+ import logging
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s %(levelname)s %(message)s",
+     datefmt="%m/%d/%Y %I:%M:%S",
+ )
+
+ DEVICE = 'cpu'
+ FILE_EXT = ['pdf', 'jpg', 'jpeg']
+ DEFAULT_SYSTEM_PROMPT = "As an intelligent AI, your task is to extract text from the image-based PDF, create a summary, and highlight the vital points within it."
+
+ MAX_NEW_TOKENS = 2048
+ DEFAULT_TEMPERATURE = 0.1
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = 2048
+
+ embedding_modelPath = 'multi-qa-mpnet-base-dot-v1'  # alternative: "sentence-transformers/all-MiniLM-l6-v2"
+ local_embeddings = HuggingFaceEmbeddings(model_name=embedding_modelPath, model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': False})
+
+
+ with contextlib.redirect_stdout(None):
+     ocr_model = ocr_predictor(
+         "db_resnet50",
+         "crnn_mobilenet_v3_large",
+         pretrained=True,
+         assume_straight_pages=True,
+     )
+
+ def loading_file():
+     return "Loading..."
+
+
+ def summarize_data(docs, llm_model, chain_type='refine'):
+     prompt_template = """
+     Write a concise summary of the following pointwise and avoid repetition:
+     {text}
+     CONCISE SUMMARY:
+     """
+     refine_template = (
+         "Your job is to produce a final summary in points.\n"
+         "Existing summary up to a certain point: {existing_answer}\n"
+         "Write the details of the summary pointwise and avoid repetition."
+     )
+
+     prompt = PromptTemplate.from_template(prompt_template)
+     refine_prompt = PromptTemplate.from_template(refine_template)
+
+     chain = load_summarize_chain(llm=llm_model,
+                                  chain_type=chain_type,
+                                  # question_prompt=prompt,
+                                  # refine_prompt=refine_prompt,
+                                  return_intermediate_steps=False,
+                                  input_key="input_documents",
+                                  output_key="output_text",
+                                  )
+     summary = chain({"input_documents": docs}, return_only_outputs=True)
+     output_text = summary["output_text"].strip()
+     regex = r"CONCISE SUMMARY:(.*)"
+
+     # fall back to the raw summary if the "CONCISE SUMMARY:" marker is absent
+     lines = output_text.split("\n")
+     matches = re.finditer(regex, output_text, re.DOTALL)
+     for matchNum, match in enumerate(matches, start=1):
+         for groupNum in range(0, len(match.groups())):
+             groupNum = groupNum + 1
+             lines = match.group(groupNum).strip().split("\n")
+     return lines
+
+
+ def process_documents(texts, data_chunk=1000, chunk_overlap=10):
+     text_splitter = CharacterTextSplitter(
+         separator="\n",
+         chunk_size=data_chunk,
+         chunk_overlap=chunk_overlap,
+         length_function=len
+     )
+
+     texts = text_splitter.split_text(texts)
+     docs = [Document(page_content=txt) for txt in texts]
+     return docs
+
+ def get_hugging_face_model(model_id='tiiuae/falcon-7b-instruct', temperature=0.01, max_tokens=4096, API_key=None):
+     llm = HuggingFaceHub(
+         huggingfacehub_api_token=API_key,
+         repo_id=model_id,
+         model_kwargs={"temperature": temperature, "max_new_tokens": max_tokens}
+     )
+     return llm
+
+
+ def document_loader(temperature, max_tokens, api_key, model_name, file_path):
+     model = get_hugging_face_model(model_id=model_name, API_key=api_key, temperature=temperature, max_tokens=max_tokens)
+     # Gradio's file input may pass either a filepath string or a file wrapper with a .name attribute
+     file_path = getattr(file_path, "name", file_path)
+     converted_txt = None
+     if file_path.endswith('.pdf'):
+         conversion_stats = convert_PDF_to_Text(PDF_file=file_path, ocr_model=ocr_model)
+         converted_txt = conversion_stats["converted_text"]
+         num_pages = conversion_stats["num_pages"]
+         was_truncated = conversion_stats["truncated"]
+         print("Converted text {}\nNum Pages: {}".format(converted_txt, num_pages))
+
+     if converted_txt:
+         print("Document Processed ..")
+         texts = process_documents(texts=converted_txt)
+         lines = summarize_data(docs=texts, llm_model=model)
+         return lines
+     else:
+         return "Error in processing document"
+
+
+
+ iface = gr.Interface(
+     fn=document_loader,
+     inputs=[
+         gr.Slider(0.01, 0.1, value=0.01, step=0.01, label="temperature", info="Choose between 0.01 and 0.1"),
+         gr.Slider(512, MAX_INPUT_TOKEN_LENGTH, value=1024, step=512, label="max new tokens", info='Max new tokens'),
+         gr.Textbox(label="Add API key", type="password"),
+         gr.Dropdown(['tiiuae/falcon-7b-instruct', 'mistralai/Mistral-7B-v0.1'], label='Large Language Model', info='LLM Service'),
+         "file"
+     ],
+     outputs="text",
+     description="Summarize your PDF Document having Image • HuggingFace",
+ )
+
+ iface.launch()
+
+ # with gr.Blocks(css=css) as demo:
+ #     with gr.Column(elem_id="col-container"):
+ #         gr.HTML(title)
+
+ #         with gr.Group():
+ #             chatbot = gr.Chatbot(height=300)
+ #             with gr.Row():
+ #                 sumarize_btn = gr.Button(value="Summarize", variant="primary", scale=1)
+ #                 clean_chat_btn = gr.Button("Delete Chat")
+
+ #         with gr.Column():
+ #             LLM_option = gr.Dropdown(['tiiuae/falcon-7b-instruct', 'mistralai/Mistral-7B-v0.1'], label='Large Language Model Selection', info='LLM Service')
+
+ #         with gr.Column():
+ #             with gr.Box():
+ #                 file_extension = gr.Dropdown(FILE_EXT, label="File Extensions", info="Select type of file to upload !")
+ #                 pdf_doc = gr.File(label="Upload File", file_types=FILE_EXT, type="file")
+ #                 with gr.Accordion(label='Advanced options', open=False):
+ #                     max_new_tokens = gr.Slider(
+ #                         label='Max new tokens',
+ #                         minimum=512,
+ #                         maximum=MAX_NEW_TOKENS,
+ #                         step=1024,
+ #                         value=DEFAULT_MAX_NEW_TOKENS,
+ #                     )
+ #                     temperature = gr.Slider(
+ #                         label='Temperature',
+ #                         minimum=0.01,
+ #                         maximum=1.0,
+ #                         step=0.05,
+ #                         value=DEFAULT_TEMPERATURE,
+ #                     )
+ #         with gr.Row():
+ #             langchain_status = gr.Textbox(label="Status", placeholder="", interactive=False)
+ #             load_pdf = gr.Button("Upload File & Generate Embeddings").style(full_width=False)
+
+ #     # chatbot = gr.Chatbot()
+ #     # question = gr.Textbox(label="Question", placeholder="Type your question and hit Enter")
+ #     # submit_button = gr.Button("Send Message")
+
+ #     if pdf_doc:
+ #         load_pdf.click(loading_file, None, langchain_status, queue=False)
+ #         load_pdf.click(document_loader, inputs=[pdf_doc, file_extension, temperature, max_new_tokens], outputs=[langchain_status], queue=False)
+
+ #         # question.submit(add_text, inputs=[chatbot, question], outputs=[chatbot, question]).then(bot, chatbot, chatbot)
+ #         # submit_btn.click(add_text, inputs=[chatbot, question], outputs=[chatbot, question]).then(bot, chatbot, chatbot)
+ #         sumarize_btn.click()
+ #         # submit_btn.then(chatf.highlight_found_text, [chatbot, sources], [sources])
+ #         clean_chat_btn.click(clear_chat, [], chatbot)
+
+
+ # demo.launch()
read_photodocument.py ADDED
@@ -0,0 +1,381 @@
+ import logging
+ from pathlib import Path
+
+ import os
+ import pprint as pp
+ import re
+ import shutil
+ import time
+ from datetime import date, datetime
+ from os.path import basename, dirname, join
+ from pathlib import Path
+
+ from cleantext import clean
+ from doctr.io import DocumentFile
+ from doctr.models import ocr_predictor
+ from libretranslatepy import LibreTranslateAPI
+ from natsort import natsorted
+ from spellchecker import SpellChecker
+ from tqdm.auto import tqdm
+ import nltk
+ import contextlib
+ nltk.download("stopwords")  # TODO: find where this requirement originates from
+
+
+ def simple_rename(filepath, target_ext=".txt"):
+     _fp = Path(filepath)
+     basename = _fp.stem
+     return f"OCR_{basename}_{target_ext}"
+
+
+ def rm_local_text_files(name_contains="RESULT_"):
+     """
+     rm_local_text_files - remove local text files
+     Args:
+         name_contains (str, optional): only remove files whose name contains this substring. Defaults to "RESULT_".
+     """
+     files = [
+         f
+         for f in Path.cwd().iterdir()
+         if f.is_file() and f.suffix == ".txt" and name_contains in f.name
+     ]
+     logging.info(f"removing {len(files)} text files")
+     for f in files:
+         os.remove(f)
+     logging.info("done")
+
+
+ def corr(
+     s: str,
+     add_space_when_numerics=False,
+     exceptions=["e.g.", "i.e.", "etc.", "cf.", "vs.", "p."],
+ ) -> str:
+     """corrects spacing in a string
+     Args:
+         s (str): the string to correct
+         add_space_when_numerics (bool, optional): [add a space when a period is between two numbers, example 5.73]. Defaults to False.
+         exceptions (list, optional): [do not change these substrings]. Defaults to ['e.g.', 'i.e.', 'etc.', 'cf.', 'vs.', 'p.'].
+     Returns:
+         str: the corrected string
+     """
+     if add_space_when_numerics:
+         s = re.sub(r"(\d)\.(\d)", r"\1. \2", s)
+
+     s = re.sub(r"\s+", " ", s)
+     s = re.sub(r'\s([?.!"](?:\s|$))', r"\1", s)
+
+     # fix space before apostrophe
+     s = re.sub(r"\s\'", r"'", s)
+     # fix space after apostrophe
+     s = re.sub(r"'\s", r"'", s)
+     # fix space before comma
+     s = re.sub(r"\s,", r",", s)
+
+     for e in exceptions:
+         expected_sub = re.sub(r"\s", "", e)
+         s = s.replace(expected_sub, e)
+
+     return s
+
+
+ def fix_punct_spaces(string):
+     """
+     fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
+     Parameters
+     ----------
+     string : str, required, input string to be corrected
+     Returns
+     -------
+     str, corrected string
+     """
+
+     fix_spaces = re.compile(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*")
+     string = fix_spaces.sub(lambda x: "{} ".format(x.group(1).replace(" ", "")), string)
+     string = string.replace(" ' ", "'")
+     string = string.replace(' " ', '"')
+     return string.strip()
+
+
+ def clean_OCR(ugly_text: str):
+     """
+     clean_OCR - clean the OCR text files.
+     Parameters
+     ----------
+     ugly_text : str, required, input string to be cleaned
+     Returns
+     -------
+     str, cleaned string
+     """
+     # Remove all the newlines.
+     cleaned_text = ugly_text.replace("\n", " ")
+     # Remove all the tabs.
+     cleaned_text = cleaned_text.replace("\t", " ")
+     # Remove all the double spaces.
+     cleaned_text = cleaned_text.replace("  ", " ")
+     # Remove all the spaces at the beginning of the text.
+     cleaned_text = cleaned_text.lstrip()
+     # remove all instances of "- " and " - "
+     cleaned_text = cleaned_text.replace("- ", "")
+     cleaned_text = cleaned_text.replace(" -", "")
+     return fix_punct_spaces(cleaned_text)
+
+
+ def move2completed(from_dir, filename, new_folder="completed", verbose=False):
+
+     # this is the better version
+     old_filepath = join(from_dir, filename)
+
+     new_filedirectory = join(from_dir, new_folder)
+
+     if not os.path.isdir(new_filedirectory):
+         os.mkdir(new_filedirectory)
+         if verbose:
+             print("created new directory for files at: \n", new_filedirectory)
+     new_filepath = join(new_filedirectory, filename)
+
+     try:
+         shutil.move(old_filepath, new_filepath)
+         logging.info("successfully moved the file {} to */completed.".format(filename))
+     except:
+         logging.info(
+             "ERROR! unable to move file to \n{}. Please investigate".format(
+                 new_filepath
+             )
+         )
+
+
+ """## pdf2text functions
+ """
+
+
+ custom_replace_list = {
+     "t0": "to",
+     "'$": "'s",
+     ",,": ", ",
+     "_ ": " ",
+     " '": "'",
+ }
+
+ replace_corr_exceptions = {
+     "i. e.": "i.e.",
+     "e. g.": "e.g.",
+     "e. g": "e.g.",
+     " ,": ",",
+ }
+
+
+ spell = SpellChecker()
+
+
+ def check_word_spelling(word: str) -> bool:
+     """
+     check_word_spelling - check the spelling of a word
+     Args:
+         word (str): word to check
+     Returns:
+         bool: True if word is spelled correctly, False if not
+     """
+
+     misspelled = spell.unknown([word])
+
+     return len(misspelled) == 0
+
+
+ def eval_and_replace(text: str, match_token: str = "- ") -> str:
+     """
+     eval_and_replace - conditionally replace all instances of a substring in a string based on whether the eliminated substring results in a valid word
+     Args:
+         text (str): text to evaluate
+         match_token (str, optional): token to replace. Defaults to "- ".
+     Returns:
+         str: text with replaced tokens
+     """
+
+     try:
+         if match_token not in text:
+             return text
+         else:
+             while True:
+                 full_before_text = text.split(match_token, maxsplit=1)[0]
+                 before_text = [
+                     char for char in full_before_text.split()[-1] if char.isalpha()
+                 ]
+                 before_text = "".join(before_text)
+                 full_after_text = text.split(match_token, maxsplit=1)[-1]
+                 after_text = [char for char in full_after_text.split()[0] if char.isalpha()]
+                 after_text = "".join(after_text)
+                 full_text = before_text + after_text
+                 if check_word_spelling(full_text):
+                     text = full_before_text + full_after_text
+                 else:
+                     text = full_before_text + " " + full_after_text
+                 if match_token not in text:
+                     break
+     except Exception as e:
+         logging.error(f"Error spell-checking OCR output, returning default text:\t{e}")
+     return text
+
+
+ def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
+     """
+     cleantxt_ocr - clean text from OCR
+     Args:
+         ugly_text (str): text to clean
+         lower (bool, optional): lowercase the text. Defaults to False.
+         lang (str, optional): language hint for clean-text. Defaults to "en".
+     Returns:
+         str: cleaned text
+     """
+     # a wrapper for clean text with options different than default
+
+     # https://pypi.org/project/clean-text/
+     cleaned_text = clean(
+         ugly_text,
+         fix_unicode=True,  # fix various unicode errors
+         to_ascii=True,  # transliterate to closest ASCII representation
+         lower=lower,  # lowercase text
+         no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
+         no_urls=True,  # replace all URLs with a special token
+         no_emails=False,  # replace all email addresses with a special token
+         no_phone_numbers=False,  # replace all phone numbers with a special token
+         no_numbers=False,  # replace all numbers with a special token
+         no_digits=False,  # replace all digits with a special token
+         no_currency_symbols=False,  # replace all currency symbols with a special token
+         no_punct=False,  # remove punctuation
+         replace_with_punct="",  # instead of removing punctuation you may replace it
+         replace_with_url="<URL>",
+         replace_with_email="<EMAIL>",
+         replace_with_phone_number="<PHONE>",
+         replace_with_number="<NUM>",
+         replace_with_digit="0",
+         replace_with_currency_symbol="<CUR>",
+         lang=lang,  # set to 'de' for German special handling
+     )
+
+     return cleaned_text
+
+
+ def format_ocr_out(OCR_data):
+
+     if isinstance(OCR_data, list):
+         text = " ".join(OCR_data)
+     else:
+         text = str(OCR_data)
+     _clean = cleantxt_ocr(text)
+     return corr(_clean)
+
+
+ def postprocess(text: str) -> str:
+     """to be used after recombining the lines"""
+
+     proc = corr(cleantxt_ocr(text))
+
+     for k, v in custom_replace_list.items():
+         proc = proc.replace(str(k), str(v))
+
+     proc = corr(proc)
+
+     for k, v in replace_corr_exceptions.items():
+         proc = proc.replace(str(k), str(v))
+
+     return eval_and_replace(proc)
+
+
+ def result2text(result, as_text=False):
+     """Convert OCR result to text"""
+
+     full_doc = []
+     for i, page in enumerate(result.pages, start=1):
+         text = ""
+         for block in page.blocks:
+             text += "\n\t"
+             for line in block.lines:
+                 for word in line.words:
+                     # print(dir(word))
+                     text += word.value + " "
+         full_doc.append(text)
+
+     return "\n".join(full_doc) if as_text else full_doc
+
+
+ def convert_PDF_to_Text(
+     PDF_file,
+     ocr_model=None,
+     max_pages: int = 20,
+ ):
+
+     st = time.perf_counter()
+     PDF_file = Path(PDF_file)
+     ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
+     logging.info(f"starting OCR on {PDF_file.name}")
+     doc = DocumentFile.from_pdf(PDF_file)
+     truncated = False
+     if len(doc) > max_pages:
+         logging.warning(
+             f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
+         )
+         doc = doc[:max_pages]
+         truncated = True
+
+     # Analyze
+     logging.info(f"running OCR on {len(doc)} pages")
+     result = ocr_model(doc)
+     raw_text = result2text(result)
+     proc_text = [format_ocr_out(r) for r in raw_text]
+     fin_text = [postprocess(t) for t in proc_text]
+
+     ocr_results = "\n\n".join(fin_text)
+
+     fn_rt = time.perf_counter() - st
+
+     logging.info("OCR complete")
+
+     results_dict = {
+         "num_pages": len(doc),
+         "runtime": round(fn_rt, 2),
+         "date": str(date.today()),
+         "converted_text": ocr_results,
+         "truncated": truncated,
+         "length": len(ocr_results),
+     }
+
+     return results_dict
+
+
+ # @title translation functions
+
+ lt = LibreTranslateAPI("https://translate.astian.org/")
+
+
+ def translate_text(text, source_l, target_l="en"):
+
+     return str(lt.translate(text, source_l, target_l))
+
+
+ def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
+     """translate a document from lang_start to lang_end
+     {'code': 'en', 'name': 'English'},
+     {'code': 'fr', 'name': 'French'},
+     {'code': 'de', 'name': 'German'},
+     {'code': 'it', 'name': 'Italian'},"""
+
+     src_folder = dirname(filepath)
+     src_folder = Path(src_folder)
+     trgt_folder = src_folder / f"translated_{lang_end}"
+     trgt_folder.mkdir(exist_ok=True)
+     with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
+         foreign_t = f.readlines()
+     in_name = basename(filepath)
+     translated_doc = []
+     for line in tqdm(
+         foreign_t, total=len(foreign_t), desc="translating {}...".format(in_name[:10])
+     ):
+         translated_line = translate_text(line, lang_start, lang_end)
+         translated_doc.append(translated_line)
+     t_out_name = "[To {}]".format(lang_end) + simple_rename(in_name) + ".txt"
+     out_path = join(trgt_folder, t_out_name)
+     with open(out_path, "w", encoding="utf-8", errors="ignore") as f_o:
+         f_o.writelines(translated_doc)
+     if verbose:
+         print("finished translating the document! - ", datetime.now())
+     return out_path
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ gradio==3.0.11
+ tiktoken
+ chromadb
+ langchain
+ langchain-community
+ unstructured
+ unstructured[local-inference]
+ transformers
+ torch
+ faiss-cpu
+ sentence-transformers
+ bitsandbytes
+ accelerate
+ python-doctr
+ # additional packages imported by read_photodocument.py
+ clean-text
+ pyspellchecker
+ libretranslatepy
+ natsort
+ nltk
+ tqdm