DeepSoft-Tech committed on
Commit
7ef595f
1 Parent(s): 3f83692

Upload 3 files

Files changed (4)
  1. .gitattributes +1 -0
  2. app.py +182 -0
  3. requirements (1).txt +8 -0
  4. wikicat_all.csv +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ wikicat_all.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,182 @@
+ import os
+ import streamlit as st
+ from get_pat_data import Patent_DataCreator
+ from datasets import load_dataset
+ import re
+ import boto3
+ import time
+ import requests
+ from bs4 import BeautifulSoup
+ import pandas as pd
+ import pinecone
+ import torch.nn.functional as F
+ from torch import Tensor
+ from transformers import AutoTokenizer, AutoModel
+ from keybert import KeyBERT
+ from keyphrase_vectorizers import KeyphraseCountVectorizer
+
+ # Keyphrase-extraction model used to pull candidate Wikipedia titles from patent text.
+ kw_model = KeyBERT(model='AI-Growth-Lab/PatentSBERTa')
+
+ s3 = boto3.resource('s3',
+                     region_name='us-east-1',
+                     aws_access_key_id='AKIA3VGKPNV5NSVBJWEE',
+                     aws_secret_access_key='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')
+
+ # Pinecone credentials and index settings.
+ INDEX_API_KEY = 'b33ddf5d-5b1a-4d0e-9a3f-572008563791'
+ INDEX_DIMENSION = 768
+ INDEX_ENV = 'gcp-starter'
+ INDEX_NAME = 'wiki-index'
+
+ pinecone.init(api_key=INDEX_API_KEY, environment=INDEX_ENV)
+ index = pinecone.Index(index_name=INDEX_NAME)
+
+ # Embedding model used to match keyphrases against the Wikipedia category index.
+ tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-base')
+ model = AutoModel.from_pretrained('intfloat/e5-base')
+
+ # data = pd.read_csv("wikicat_all.csv")
+
+
+ def get_pat_text(pnkc_no):
+     """Fetch a patent from S3 and return its title and abstract as a single string."""
+     pat_data = Patent_DataCreator(pnkc_no)
+     bib_key, pnkc_without_kindcode, pnkc_suffix = pat_data.get_bib_key()
+     bib_bucket = pat_data.get_bib_bucket()
+     bib_data = pat_data.get_bib_data(s3)
+     claims_data = pat_data.get_claims_data(s3)
+     desc_data = pat_data.get_desc_data(s3)
+     df1, df2, df3 = pat_data.get_patent_dfs()
+     dataset = pat_data.get_patent_dataset()
+
+     Title = dataset[1]['Title'][0]
+     Abstract = dataset[1]['Abstract'][0]
+     Claims = dataset[1]['Claims'][0]
+     Description = dataset[1]['Description'][0]
+     # SOI = dataset[1]['SOI'][0]
+
+     pat_text = Title + Abstract
+     return pat_text
+
+
+ # Fetch the title and categories of a Wikipedia article.
+ def fetch_wikipedia_data(article_title):
+     url = f"https://en.wikipedia.org/wiki/{article_title.replace(' ', '_')}"
+     response = requests.get(url)
+
+     if response.status_code == 200:
+         soup = BeautifulSoup(response.text, 'html.parser')
+
+         # Find the categories section at the bottom of the page.
+         categories_section = soup.find("div", {"class": "mw-normal-catlinks"})
+
+         if categories_section:
+             # Extract the individual categories.
+             categories = [cat.text for cat in categories_section.find("ul").find_all("li")]
+             return {"title": article_title, "categories": categories}
+
+     return None
+
+
+ def get_wiki_category_aprch_1(pat_text):
+     """Approach 1: scrape categories directly from the Wikipedia pages of extracted keyphrases."""
+     print(pat_text)
+     keywords = kw_model.extract_keywords(pat_text, keyphrase_ngram_range=(1, 3), top_n=15,
+                                          vectorizer=KeyphraseCountVectorizer())
+     titles = [keyword[0] for keyword in keywords]
+
+     data = [fetch_wikipedia_data(title) for title in titles]
+     cats = [item['categories'] for item in data if item is not None]
+     result = [cat for cat_list in cats for cat in cat_list]
+     # Deduplicate while preserving order.
+     res = [i for n, i in enumerate(result) if i not in result[:n]]
+     return titles, res
+
+
+ # def get_wiki_category_aprch_2(pat_text):
+ #     print(pat_text)
+ #     keywords = kw_model.extract_keywords(pat_text, keyphrase_ngram_range=(1, 3), top_n=10,
+ #                                          vectorizer=KeyphraseCountVectorizer())
+ #     titles = [keyword[0] for keyword in keywords]
+ #     data = [fetch_wikipedia_data(title) for title in titles]
+ #     cats = [item['categories'] for item in data if item is not None]
+ #     result = [cat for cat_list in cats for cat in cat_list]
+ #     res = [i for n, i in enumerate(result) if i not in result[:n]]
+ #     return res
+
+
+ def average_pool(last_hidden_states: Tensor,
+                  attention_mask: Tensor) -> Tensor:
+     """Mean-pool the last hidden states, ignoring padding tokens."""
+     last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
+     return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+
+
+ def get_wiki_category(pat_text):
+     """Approach 2: embed extracted keyphrases and look up categories in the Pinecone index."""
+     keywords = kw_model.extract_keywords(pat_text, keyphrase_ngram_range=(1, 3), top_n=3,
+                                          vectorizer=KeyphraseCountVectorizer())
+     titles = [keyword[0] for keyword in keywords]
+
+     batch_dict = tokenizer(titles, padding=True, truncation=True, return_tensors='pt')
+     outputs = model(**batch_dict)
+     embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
+     embeddings = F.normalize(embeddings, p=2, dim=1)
+     values = embeddings.tolist()
+
+     categories_list = []
+     for value in values:
+         try:
+             response = index.query(vector=value, top_k=3, include_metadata=True)
+         except Exception:
+             # Re-initialise the Pinecone connection and retry with a wider search.
+             pinecone.init(api_key=INDEX_API_KEY, environment=INDEX_ENV)
+             retry_index = pinecone.Index(INDEX_NAME)
+             response = retry_index.query(vector=value, top_k=5, include_metadata=True)
+
+         categories = response['matches'][0]['metadata']['categories']
+         categories_list.append(categories.split(','))
+
+     flat_list = [category for inner_list in categories_list for category in inner_list]
+     cleaned = [item.replace("'", '').strip() for item in flat_list]
+     return list(set(cleaned))
+
+
+ def main():
+     st.title('Wiki Classifier')
+
+     pnkc_no = st.text_input("Enter a pnkc number:")
+     pat_text = st.text_area("Enter a text paragraph:")
+
+     if st.button('Get Wiki categories'):
+         if pnkc_no:
+             text = get_pat_text(pnkc_no)
+         else:
+             text = pat_text
+
+         st.write("Predicting Wiki Categories for text:", text[:200])
+         start_time = time.time()
+         titles, wiki_categories = get_wiki_category_aprch_1(text)
+         end_time = time.time()
+         st.write({f"Wiki_titles for {pnkc_no} Text": titles})
+         st.write({f"Wiki_categories for {pnkc_no} Text": wiki_categories})
+
+
+ if __name__ == "__main__":
+     main()
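
Note on credentials: app.py commits the AWS and Pinecone keys in plain text. A minimal sketch of how the same clients could be built from environment variables instead (the variable names AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and PINECONE_API_KEY are illustrative assumptions, not part of this commit):

    import os
    import boto3
    import pinecone

    # Assumed environment variable names; set these as Space secrets rather than hard-coding them.
    s3 = boto3.resource(
        's3',
        region_name='us-east-1',
        aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
    )

    pinecone.init(api_key=os.environ['PINECONE_API_KEY'], environment='gcp-starter')
    index = pinecone.Index(index_name='wiki-index')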
requirements (1).txt ADDED
@@ -0,0 +1,8 @@
+ keybert
+ BeautifulSoup4
+ boto3
+ keyphrase_vectorizers
+ datasets
+ pinecone-client
+ transformers
+ torch
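
Note: app.py also imports streamlit, requests and pandas, which are not listed here. On a Streamlit Space, streamlit is supplied by the SDK and the other two usually arrive as transitive dependencies of transformers and datasets, but pinning them explicitly would make the environment more reproducible, e.g. by appending (an optional suggestion, not part of this commit):

    streamlit
    requests
    pandas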
wikicat_all.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:465af8d1afc3362775ad4af1f09cc83ca7732193c09c47eccd3bf4e5f5c1e172
+ size 83555140