# patent_app_v1/pages/Patent_Ingestion.py
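"""
Streamlit page that ingests a sample of the BIG Patent dataset:
each abstract is summarized with a BART model, embedded with a
SentenceTransformer model, and stored in a persistent Chroma collection.
"""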
# import required libraries
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub
#from langchain.vectorstores import Chroma
from langchain_community.vectorstores import Chroma
import tensorflow_datasets as tfds
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from transformers import BartForConditionalGeneration, BartTokenizer
import textwrap
import chromadb
import streamlit as st
import sys, yaml
import uuid
import Utilities as ut
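# ut.get_tokens() is expected to return a dict of configuration values.
# Minimal sketch of the keys this page reads (key names come from the code
# below; the example values are assumptions):
#   BART_model: e.g. "facebook/bart-large-cnn"
#   embedding_model: a sentence-transformers model id
#   dataset_chroma_db: path to the persistent Chroma database
#   dataset_chroma_db_collection_name: name of the target collection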
def text_summarizer(text):
    # Summarize the given text with the BART model named in the configuration
    initdict = ut.get_tokens()
    BART_Model_Name = initdict["BART_model"]
    #model_name = "facebook/bart-large-cnn"
    model = BartForConditionalGeneration.from_pretrained(BART_Model_Name)
    tokenizer = BartTokenizer.from_pretrained(BART_Model_Name)
    # BART needs no task prefix; encode the raw text, truncated to the model's 1024-token limit
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)
    # Generate a summary of roughly 50-150 tokens using beam search
    summary_ids = model.generate(inputs, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    # Wrap the summary at 80 characters for display
    formatted_summary = "\n".join(textwrap.wrap(summary, width=80))
    return formatted_summary
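# Example (hypothetical input): text_summarizer("A long patent abstract ...")
# returns a summary of roughly 50-150 tokens, wrapped at 80 characters per line.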
def load_patentBIGdata():
    # Read embedding model and Chroma settings from the shared configuration
    initdict = ut.get_tokens()
    embedding_model_id = initdict["embedding_model"]
    chromadbpath = initdict["dataset_chroma_db"]
    chromadbcollname = initdict["dataset_chroma_db_collection_name"]
    embedding_model = SentenceTransformer(embedding_model_id)
    chroma_client = chromadb.PersistentClient(path=chromadbpath)
    collection = chroma_client.get_or_create_collection(name=chromadbcollname)
    # Load a small validation slice of the BIG Patent dataset (CPC section "a")
    ds = load_dataset("big_patent", "a", split="validation[:1%]", trust_remote_code=True)
    # Ingest only the first 10 records
    for record in ds.take(10):
        abstract, desc = record["abstract"], record["description"]
        # Summarize the abstract (max_length=150 is in tokens; the description is not used here)
        abstract = text_summarizer(abstract)
        textembeddings = embedding_model.encode(abstract).tolist()
        # Use the first 8 characters of a UUID as the document id
        genguid = str(uuid.uuid4())
        uniqueid = genguid[:8]
        # Store the summarized abstract and its embedding in the Chroma collection
        collection.add(
            documents=[abstract],
            embeddings=[textembeddings],
            ids=[uniqueid]
        )
        #print(abstract)
st.title("Patent Ingestion - BIG Patent")
# Ingestion form: a single button that triggers loading of the BIG Patent sample
with st.form("chat_form"):
    submit_button = st.form_submit_button("Upload BIG Patent data...")
    if submit_button:
        load_patentBIGdata()
        response = "BIG Patent dataset was successfully loaded"
        st.write(response)