# NOTE: "Spaces: Sleeping" header was a Hugging Face Spaces page artifact
# captured when this file was scraped; it is not part of the program.
# import required libraries
# --- standard library ---
import sys
import textwrap
import uuid

import yaml

# --- third-party ---
import chromadb
import streamlit as st
import tensorflow_datasets as tfds
from datasets import load_dataset
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
#from langchain.vectorstores import Chroma
from langchain_community.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from transformers import BartForConditionalGeneration, BartTokenizer

# --- local ---
import Utilities as ut
def text_summarizer(text: str) -> str:
    """Summarize *text* with a BART model and wrap the result to 80 columns.

    The model name is read from the project config via ``ut.get_tokens()``
    (key ``"BART_model"``, e.g. ``facebook/bart-large-cnn``).

    Args:
        text: Raw input text (truncated to 1024 tokens before summarizing).

    Returns:
        The generated summary (50-150 tokens), line-wrapped at width 80.
    """
    # Loading a BART checkpoint is expensive and this function is called once
    # per record by load_patentBIGdata(); cache the model/tokenizer on the
    # function object so they are built only on the first call.
    if not hasattr(text_summarizer, "_bart"):
        initdict = ut.get_tokens()
        bart_model_name = initdict["BART_model"]
        #model_name = "facebook/bart-large-cnn"
        text_summarizer._bart = (
            BartForConditionalGeneration.from_pretrained(bart_model_name),
            BartTokenizer.from_pretrained(bart_model_name),
        )
    model, tokenizer = text_summarizer._bart

    inputs = tokenizer.encode(
        "summarize: " + text, return_tensors="pt", max_length=1024, truncation=True
    )
    summary_ids = model.generate(
        inputs,
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return "\n".join(textwrap.wrap(summary, width=80))
def load_patentBIGdata():
    """Ingest a small sample of the BIG Patent dataset into ChromaDB.

    Reads configuration via ``ut.get_tokens()`` (embedding model id, Chroma
    persistence path, collection name), loads a 1% validation slice of the
    "big_patent" subset "a", summarizes the abstract of the first 10 records
    with :func:`text_summarizer`, embeds each summary and stores it in a
    persistent Chroma collection.

    Side effects:
        Writes documents/embeddings into the on-disk Chroma collection.
    """
    initdict = ut.get_tokens()
    embedding_model_id = initdict["embedding_model"]
    chromadbpath = initdict["dataset_chroma_db"]
    chromadbcollname = initdict["dataset_chroma_db_collection_name"]

    embedding_model = SentenceTransformer(embedding_model_id)
    chroma_client = chromadb.PersistentClient(path=chromadbpath)
    collection = chroma_client.get_or_create_collection(name=chromadbcollname)

    # Load the BIG Patent dataset (small slice keeps the demo fast).
    ds = load_dataset("big_patent", "a", split="validation[:1%]", trust_remote_code=True)

    # Store the summarized abstracts of the first 10 records into the collection.
    for record in ds.take(10):
        # Summarize the abstract (~150 tokens max); the original also pulled
        # record["description"] but never used it, so it is not read here.
        abstract = text_summarizer(record["abstract"])
        textembeddings = embedding_model.encode(abstract).tolist()
        # Use the full UUID4 as the document id: the previous 8-character
        # prefix needlessly risked id collisions for no benefit.
        uniqueid = str(uuid.uuid4())
        collection.add(
            documents=[abstract],
            embeddings=[textembeddings],
            ids=[uniqueid],
        )
        #print(abstract)
# --- Streamlit UI ---------------------------------------------------------
st.title("Patent Ingestion - BIG Patent")

# Main chat form: a single submit button that triggers the ingestion run.
with st.form("chat_form"):
    clicked = st.form_submit_button("Upload BIG Patent data...")
    if clicked:
        load_patentBIGdata()
        st.write("BIG Patent dataset was successfully loaded")