Spaces:

aipatseer
/

New-Sparse-Endpoint

Sleeping

App Files Files Community

New-Sparse-Endpoint / app.py

tejastake

Update app.py

e2ea72c verified 6 months ago

raw

history blame contribute delete

5.98 kB

	from fastapi import FastAPI, Depends, HTTPException
	from pydantic import BaseModel
	import torch
	import torch.nn.functional as F
	import logging
	import sys
	from pinecone_text.sparse import SpladeEncoder
	import re

	logger = logging.getLogger(__name__)

	# Route INFO-and-above records from the root logger to stdout.
	# `logging.INFO` is used directly: passing the *name* "INFO" to
	# `logging.getLevelName` only yields the number through a reverse lookup
	# that the logging documentation describes as kept for backward
	# compatibility.
	logging.basicConfig(
	    level=logging.INFO,
	    handlers=[logging.StreamHandler(sys.stdout)],
	    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
	)
	logging.info('Logging module started')

	def get_session() -> bool:
	    """Return the placeholder "session" used by the health-check dependency.

	    No real session is opened; the function always returns ``True``.
	    """
	    return True

	def is_database_online(session: bool = Depends(get_session)) -> bool:
	    """Report whether the backing "database" is reachable.

	    Args:
	        session: Value supplied through the ``get_session`` dependency,
	            which always yields ``True`` as written.

	    Returns:
	        The ``session`` value unchanged.
	    """
	    # NOTE(review): the only visible use of this function is the
	    # commented-out "/healthz" route registration.
	    return session

	# Application instance; the "/sparse/" endpoint is registered on it via decorator.
	app = FastAPI()
	# NOTE(review): health-check route is disabled. `health` is not among the
	# imports visible in this file, so re-enabling this line needs that import.
	# app.add_api_route("/healthz", health([is_database_online]))

	class EmbeddingModels:
	    """Bundle the SPLADE sparse encoder with the patent-text clean-up it uses.

	    Attributes:
	        device: Device string handed to the encoder ("cuda" or "cpu" unless
	            the caller supplies something else).
	        sparse_model: ``SpladeEncoder`` instance created on ``device``.
	    """

	    # Boilerplate section headings replaced by a separator before encoding.
	    # Alternatives are tried left to right, so longer headings are listed
	    # before the shorter ones they contain (e.g. "BACKGROUND OF THE
	    # INVENTION" before "BACKGROUND"). Matching is case-insensitive, so
	    # entries that differed only by case (or were repeated) are listed once.
	    _SECTION_HEADINGS = (
	        "SUBSTITUTE SHEET RULE 2 SUMMARY OF THE INVENTION",
	        "BRIEF DESCRIPTION OF PREFERRED EMBODIMENTS",
	        "BRIEF DESCRIPTION OF THE DRAWINGS/FIGURES",
	        "BEST MODE FOR CARRYING OUT THE INVENTION",
	        "BACKGROUND AND SUMMARY OF THE INVENTION",
	        "FIELD AND BACKGROUND OF THE INVENTION",
	        "BACKGROUND OF THE PRESENT INVENTION",
	        "FIELD AND BACKGROUND OF INVENTION",
	        "STAND DER TECHNIK- BACKGROUND ART",
	        "BRIEF DESCRIPTION OF THE DRAWINGS",
	        "DESCRIPTION OF THE RELATED ART",
	        "BRIEF SUMMARY OF THE INVENTION",
	        "UTILITY MODEL CLAIMS A CONTENT",
	        "DESCRIPTION OF BACKGROUND ART",
	        "BRIEF DESCRIPTION OF DRAWINGS",
	        "BACKGROUND OF THE INVENTION",
	        "BACKGROUND TO THE INVENTION",
	        "TÉCNICA ANTERIOR- PRIOR ART",
	        "DISCLOSURE OF THE INVENTION",
	        "BRIEF SUMMARY OF INVENTION",
	        "BACKGROUND OF RELATED ART",
	        "SUMMARY OF THE DISCLOSURE",
	        "SUMMARY OF THE INVENTIONS",
	        "SUMMARY OF THE INVENTION",
	        "OBJECTS OF THE INVENTION",
	        "THE CONTENT OF INVENTION",
	        "DISCLOSURE OF INVENTION",
	        "Complete Specification",
	        "RELATED BACKGROUND ART",
	        "BACKGROUND INFORMATION",
	        "BACKGROUND TECHNOLOGY",
	        "DETAILED DESCRIPTION",
	        "SUMMARY OF INVENTION",
	        "PROBLEM TO BE SOLVED",
	        "EFFECT OF INVENTION",
	        "WHAT IS CLAIMED IS",
	        "What is Claim is",
	        "SUBSTITUTE SHEET",
	        "SELECTED DRAWING",
	        "BACK GROUND ART",
	        "BACKGROUND ART",
	        "JPO&INPIT",
	        "CONSTITUTION",
	        "DEFINITIONS",
	        "Related Art",
	        "BACKGROUND",
	        "JPO&NCIPI",
	        "COPYRIGHT",
	        "SOLUTION",
	        "SUMMARY",
	    )
	    _SECTION_HEADING_RE = re.compile(
	        r'\b(' + '|'.join(_SECTION_HEADINGS) + r')\b', re.IGNORECASE
	    )

	    # A whole <heading ...>...</heading> span (text included), any other
	    # tag, an id="p-123" attribute, or a colon.
	    _MARKUP_RE = re.compile(
	        r'<\s*heading[^>]*>(.*?)<\s*/\s*heading>|<[^<]+>|id="p-\d+"|:',
	        re.IGNORECASE,
	    )

	    # Chemical element symbols; the formula pattern is a run of symbols,
	    # each optionally followed by a count (e.g. "H2O", "NaCl").
	    # Case-sensitive on purpose.
	    _ELEMENT_SYMBOLS = (
	        'H|He|Li|Be|B|C|N|O|F|Ne|Na|Mg|Al|Si|P|S|Cl|Ar|K|Ca|Sc|Ti|V|Cr|Mn|Fe|'
	        'Co|Ni|Cu|Zn|Ga|Ge|As|Se|Br|Kr|Rb|Sr|Y|Zr|Nb|Mo|Tc|Ru|Rh|Pd|Ag|Cd|In|'
	        'Sn|Sb|Te|I|Xe|Cs|Ba|La|Hf|Ta|W|Re|Os|Ir|Pt|Au|Hg|Tl|Pb|Bi|Po|At|Rn|'
	        'Fr|Ra|Ac|Rf|Db|Sg|Bh|Hs|Mt|Ds|Rg|Cn|Nh|Fl|Mc|Lv|Ts|Og|Ce|Pr|Nd|Pm|Sm|'
	        'Eu|Gd|Tb|Dy|Ho|Er|Tm|Yb|Lu|Th|Pa|U|Np|Pu|Am|Cm|Bk|Cf|Es|Fm|Md|No|Lr'
	    )
	    _FORMULA_RE = re.compile(r'\b((?:(?:' + _ELEMENT_SYMBOLS + r')\d*)+)\b')

	    # Parenthesised reference marks: "(12)", "(3a, 4)", or a single letter
	    # "(a)". A purely alphabetic word of two or more letters in parentheses
	    # is left alone.
	    _BRACKETED_REF_RE = re.compile(
	        r'\((?![A-Za-z]+\))[\w\d\s,-]+\)|\([A-Za-z]\)'
	    )

	    # One or more consecutive "[SEP]" markers with any whitespace after them.
	    _SEP_RUN_RE = re.compile(r'(\[SEP\]+\s*)+', re.IGNORECASE)

	    # Dash-like and equals-like characters that glue compound names together.
	    _DASH_RE = re.compile(r'[—\-═=]')

	    def __init__(self, device=None):
	        """Create the sparse encoder on ``device``.

	        Args:
	            device: Device string for the encoder. When omitted, "cuda" is
	                used if ``torch.cuda.is_available()`` and "cpu" otherwise.
	                The check runs at construction rather than when the class
	                is defined.
	        """
	        if device is None:
	            device = "cuda" if torch.cuda.is_available() else "cpu"
	        self.device = device
	        logging.info('Using Device %s', self.device)
	        self.sparse_model = SpladeEncoder(device=self.device)

	    def preprocessing_patent_data(self, text: str) -> str:
	        """Strip patent boilerplate, markup, reference marks and formulas.

	        Args:
	            text: Raw patent text, possibly containing XML/HTML-like tags.

	        Returns:
	            The cleaned text with whitespace collapsed and stripped.
	        """
	        # Replace boilerplate headings, then flatten all whitespace so the
	        # markup pattern never has to span a newline.
	        text = self._SECTION_HEADING_RE.sub('[SEP]', text)
	        text = ' '.join(text.split())
	        marked = self._MARKUP_RE.sub('[SEP]', text)

	        # Formulas found in the text; single-character hits are ignored.
	        formulas = [
	            name for name in self._FORMULA_RE.findall(marked) if len(name) >= 2
	        ]

	        def keep_if_chemical(match):
	            # Keep a bracketed group only when it contains a known formula.
	            bracketed = match.group(0)
	            if any(name in bracketed for name in formulas):
	                return bracketed
	            return ' '

	        cleaned = self._BRACKETED_REF_RE.sub(keep_if_chemical, marked)
	        cleaned = ' '.join(cleaned.split())
	        cleaned = self._SEP_RUN_RE.sub(' ', cleaned)

	        # Split hyphenated compounds, then drop the formulas themselves.
	        # NOTE(review): the formula pattern also matches ordinary
	        # capitalised words that happen to be element symbols ("In", "As",
	        # "He", "I", ...), so those are removed too - confirm this is
	        # acceptable.
	        cleaned = self._DASH_RE.sub(' ', cleaned)
	        cleaned = self._FORMULA_RE.sub('', cleaned)

	        # Tidy the punctuation and spacing left behind by the removals.
	        cleaned = re.sub(r' ,+|, +', ' ', cleaned)
	        cleaned = re.sub(r' +', ' ', cleaned)
	        cleaned = re.sub(r'\.+', '.', cleaned)
	        cleaned = re.sub(r'[0-9] [0-9] +', ' ', cleaned)
	        # NOTE(review): as written this replaces a space with a space and so
	        # changes nothing; possibly meant to remove empty "( )" - confirm.
	        cleaned = re.sub(r'( )', ' ', cleaned)
	        return cleaned.strip()

	    def get_single_sparse_text_embedding(self, df_chunk):
	        """Clean one text and encode it with the sparse model.

	        Args:
	            df_chunk: The text to embed (a single string, despite the name).

	        Returns:
	            Whatever ``SpladeEncoder.encode_documents`` returns for the
	            cleaned text, without further normalisation - presumably a
	            mapping with 'indices' and 'values' keys; confirm against
	            pinecone_text.
	        """
	        cleaned = self.preprocessing_patent_data(df_chunk)
	        return self.sparse_model.encode_documents(cleaned)


	# Single shared instance, built when the module is imported; the "/sparse/"
	# handler below uses it for every request.
	model = EmbeddingModels()

	class TextInput(BaseModel):
	    """Request body for the "/sparse/" endpoint."""

	    # The text to clean and embed.
	    text: str


	@app.post("/sparse/")
	async def embed_text(item: TextInput):
	    """Return the sparse embedding of ``item.text``.

	    Args:
	        item: Request body carrying the text to embed.

	    Returns:
	        The value produced by ``model.get_single_sparse_text_embedding``.

	    Raises:
	        HTTPException: Status 500, with the error message as detail, when
	            cleaning or encoding fails.
	    """
	    logging.info('Received text for embedding: %s', item.text)
	    try:
	        # NOTE(review): this call is synchronous inside an `async def`
	        # handler, so it occupies the event loop while it runs; consider a
	        # plain `def` handler or a thread pool if concurrent requests matter.
	        embeddings = model.get_single_sparse_text_embedding(item.text)
	    except Exception as e:
	        # logging.exception records the traceback along with the message.
	        logging.exception('Error during embedding process: %s', e)
	        raise HTTPException(status_code=500, detail=str(e)) from e
	    logging.info('Embedding process completed')
	    return embeddings