rdose committed on
Commit 1314a1a
1 Parent(s): 9fd0daa

Update app.py

Files changed (1)
  1. app.py +114 -104
app.py CHANGED
@@ -1,12 +1,21 @@
 
 
 import numpy as np
 import onnxruntime
 import onnx
 import gradio as gr
- import requests
- import json
 
 
 try:
     from extractnet import Extractor
@@ -27,28 +36,20 @@ except ImportError:
 
 print('[i] Using',EXTRACTOR_NET)
 
- import math
- from transformers import AutoTokenizer
 import spacy
- import os
- from transformers import pipeline
- import itertools
- import pandas as pd
 
- # from bertopic import BERTopic
- # from huggingface_hub import hf_hub_url, cached_download
 
- # import nltk
- # nltk.download('stopwords')
- # nltk.download('wordnet')
- # nltk.download('omw-1.4')
- # from nltk.corpus import stopwords
- # from nltk.stem import WordNetLemmatizer
- # from nltk.stem import PorterStemmer
 
- # from unicodedata import normalize
 
- # import re
 
 
 OUT_HEADERS = ['E','S','G']
@@ -57,82 +58,85 @@ DF_SP500 = pd.read_csv('SP500_constituents.zip',compression=dict(method='zip'))
 MODEL_TRANSFORMER_BASED = "distilbert-base-uncased"
 MODEL_ONNX_FNAME = "ESG_classifier_batch.onnx"
 MODEL_SENTIMENT_ANALYSIS = "ProsusAI/finbert"
-
-
- # BERTOPIC_REPO_ID = "oMateos2020/BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
- # BERTOPIC_FILENAME = "BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
- # bertopic_model = BERTopic.load(cached_download(hf_hub_url(BERTOPIC_REPO_ID , BERTOPIC_FILENAME )), embedding_model="paraphrase-MiniLM-L3-v2")
-
- # def _topic_sanitize_word(text):
- #     """Performs a first cleaning/normalization pass over the text using regular expressions"""
- #     text = re.sub(r'@[\w_]+|#[\w_]+|https?://[\w_./]+', '', text) # Removes mentions and URLs; more relevant for tweets, but kept in case a mention or URL appears in these web reviews
- #     text = re.sub('\S*@\S*\s?', '', text) # Removes email addresses
- #     text = re.sub(r'\((\d+)\)', '', text) # Removes numbers in parentheses
- #     text = re.sub(r'^\d+', '', text) # Removes standalone leading numbers
- #     text = re.sub(r'\n', '', text) # Removes line breaks
- #     text = re.sub('\s+', ' ', text) # Removes extra whitespace
- #     text = re.sub(r'[“”]', '', text) # Removes quotation-mark characters
- #     text = re.sub(r'[()]', '', text) # Removes parentheses
- #     text = re.sub('\.', '', text) # Removes periods
- #     text = re.sub('\,', '', text) # Removes commas
- #     text = re.sub('’s', '', text) # Removes possessives
- #     #text = re.sub(r'-+', '', text) # Strips hyphens to join compound words (would normalize some cases, e.g. "exmujer" and "ex-mujer" both to "exmujer")
- #     text = re.sub(r'\.{3}', ' ', text) # Replaces ellipses
- #     # This regex was added "by hand" after seeing it was needed for some examples
- #     text = re.sub(r"([\.\?])", r"\1 ", text) # Inserts a space after periods and question marks
- #     # -> NFD (Normalization Form Canonical Decomposition) and remove diacritics
- #     text = re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1",
- #                   normalize( "NFD", text), 0, re.I) # Removes diacritics (accented and otherwise marked character variants are reduced to their simple form, except for 'ñ')
- #     # -> NFC (Normalization Form Canonical Composition)
- #     text = normalize( 'NFC', text)
-
- #     return text.lower().strip()
-
- # def _topic_clean_text(text, lemmatize=True, stem=True):
- #     words = text.split()
- #     non_stopwords = [word for word in words if word not in stopwords.words('english')]
- #     clean_text = [_topic_sanitize_word(word) for word in non_stopwords]
- #     if lemmatize:
- #         lemmatizer = WordNetLemmatizer()
- #         clean_text = [lemmatizer.lemmatize(word) for word in clean_text]
- #     if stem:
- #         ps =PorterStemmer()
- #         clean_text = [ps.stem(word) for word in clean_text]
-
- #     return ' '.join(clean_text).strip()
-
-
- # #SECTOR_LIST = list(DF_SP500.Sector.unique())
- # SECTOR_LIST = ['Industry',
- #                'Health',
- #                'Technology',
- #                'Communication',
- #                'Consumer Staples',
- #                'Consumer Discretionary',
- #                'Utilities',
- #                'Financials',
- #                'Materials',
- #                'Real Estate',
- #                'Energy']
-
- # SECTOR_TOPICS = []
- # for sector in SECTOR_LIST:
- #     topics, _ = bertopic_model.find_topics(_topic_clean_text(sector), top_n=5)
- #     SECTOR_TOPICS.append(topics)
-
- # def _topic2sector(pred_topics):
- #     out = []
- #     for pred_topic in pred_topics:
- #         relevant_sectors = []
- #         for i in range(len(SECTOR_LIST)):
- #             if pred_topic in SECTOR_TOPICS[i]:
- #                 relevant_sectors.append(list(DF_SP500.Sector.unique())[i])
- #         out.append(relevant_sectors)
- #     return out
-
- # def _inference_topic_match(text):
- #     out, _ = bertopic_model.transform([_topic_clean_text(t) for t in text])
- #     return out
 
 def get_company_sectors(extracted_names, threshold=0.95):
     '''
@@ -184,7 +188,6 @@ def filter_spans(spans, keep_longest=True):
     return result
 
 
-
 def _inference_ner_spancat(text, limit_outputs=10):
     nlp = spacy.load("en_pipeline")
     out = []
@@ -264,7 +267,7 @@ def _inference_classifier(text):
 
     return sigmoid(ort_outs[0])
 
- def inference(input_batch,isurl,use_archive,limit_companies=10):
     url_list = [] #Only used if isurl
     input_batch_content = []
     # if file_in.name is not "":
@@ -285,7 +288,8 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
     if isurl:
         print("[i] Data is URL")
         if use_archive:
-             print("[i] Use chached URL from archive.org")
        for row_in in input_batch_r:
            if isinstance(row_in , list):
                url = row_in[0]
@@ -324,9 +328,10 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
     sentiment = _inference_sentiment_model_pipeline(input_batch_content )
     print("[i] Running NER using custom spancat inference...")
     ner_labels = _inference_ner_spancat(input_batch_content ,limit_outputs=limit_companies)
-     # print("[i] BERTopic...")
-     # topics = _inference_topic_match(input_batch_content)
-
     df = pd.DataFrame(prob_outs,columns =['E','S','G'])
     if isurl:
         df['URL'] = url_list
@@ -334,6 +339,7 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
     df['content_id'] = range(1, len(input_batch_r)+1)
     df['sent_lbl'] = [d['label'] for d in sentiment ]
     df['sent_score'] = [d['score'] for d in sentiment ]
     #df['sector_pred'] = pd.DataFrame(_topic2sector(topics)).iloc[:, 0]
     print("[i] Pandas output shape:",df.shape)
 
@@ -343,7 +349,9 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
     for idx in range(len(df.index)):
         if ner_labels[idx]: #not empty
             for ner in ner_labels[idx]:
-
                 df = pd.concat( [df, df.loc[[idx]].assign(company=ner[0], sector=ner[1])], join='outer', ignore_index=True) #axis=0
 
     return df #ner_labels, {'E':float(prob_outs[0]),"S":float(prob_outs[1]),"G":float(prob_outs[2])},{sentiment['label']:float(sentiment['score'])},"**Summary:**\n\n" + summary
@@ -359,6 +367,7 @@ API input parameters:
 - List: list of text. Either a list of URLs of the news (English) or a list of extracted news contents
 - 'Data type': int. 0=list is of extracted news contents, 1=list is of URLs.
 - `use_archive`: boolean. The model will extract the archived version of the indicated URL from archive.org. This is useful for old news and to bypass news behind a paywall
 - `limit_companies`: integer. Number of relevant companies found to report.
 
 """
@@ -370,11 +379,12 @@ examples = [[ [['https://www.bbc.com/news/uk-62732447'],
               ['https://www.bbc.com/news/world-europe-62766867'],
               ['https://www.bbc.com/news/business-62524031'],
               ['https://www.bbc.com/news/business-62728621'],
-               ['https://www.bbc.com/news/science-environment-62680423']],'url',False,5]]
 demo = gr.Interface(fn=inference,
                     inputs=[gr.Dataframe(label='input batch', col_count=1, datatype='str', type='array', wrap=True),
                             gr.Dropdown(label='data type', choices=['text','url'], type='index', value='url'),
-                             gr.Checkbox(label='if url parse cached in archive.org'),
                             gr.Slider(minimum=1, maximum=10, step=1, label='Limit NER output', value=5)],
                     outputs=[gr.Dataframe(label='output raw', col_count=1, type='pandas', wrap=True, header=OUT_HEADERS)],
                     #gr.Label(label='Company'),
 
+ import os
+ import re
+ import math
+ import requests
+ import json
+ import itertools
 
 import numpy as np
+ import pandas as pd
+
 import onnxruntime
 import onnx
 import gradio as gr
 
+ from huggingface_hub import hf_hub_url, cached_download
+ from transformers import AutoTokenizer
+ from transformers import pipeline
 
 try:
     from extractnet import Extractor
 
 print('[i] Using',EXTRACTOR_NET)
 
 import spacy
 
+ from bertopic import BERTopic
 
+ import nltk
+ nltk.download('stopwords')
+ nltk.download('wordnet')
+ nltk.download('omw-1.4')
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ from nltk.stem import PorterStemmer
 
+ from unicodedata import normalize
 
 
 
 OUT_HEADERS = ['E','S','G']
 MODEL_TRANSFORMER_BASED = "distilbert-base-uncased"
 MODEL_ONNX_FNAME = "ESG_classifier_batch.onnx"
 MODEL_SENTIMENT_ANALYSIS = "ProsusAI/finbert"
+ #MODEL3
+ #BERTOPIC_REPO_ID = "oMateos2020/BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
+ #BERTOPIC_FILENAME = "BERTopic-paraphrase-MiniLM-L3-v2-51topics-guided-model3"
+ #bertopic_model = BERTopic.load(cached_download(hf_hub_url(BERTOPIC_REPO_ID , BERTOPIC_FILENAME )), embedding_model="paraphrase-MiniLM-L3-v2")
+
+ BERTOPIC_REPO_ID = "oMateos2020/BERTopic-distilbert-base-nli-mean-tokens"
+ BERTOPIC_FILENAME = "BERTopic-distilbert-base-nli-mean-tokens"
+ bertopic_model = BERTopic.load(cached_download(hf_hub_url(BERTOPIC_REPO_ID , BERTOPIC_FILENAME )))
+
+ #SECTOR_LIST = list(DF_SP500.Sector.unique())
+ SECTOR_LIST = ['Industry',
+                'Health',
+                'Technology',
+                'Communication',
+                'Consumer Staples',
+                'Consumer Discretionary',
+                'Utilities',
+                'Financials',
+                'Materials',
+                'Real Estate',
+                'Energy']
+
+
+ def _topic_sanitize_word(text):
+     """Performs a first cleaning/normalization pass over the text using regular expressions"""
+     text = re.sub(r'@[\w_]+|#[\w_]+|https?://[\w_./]+', '', text) # Removes mentions and URLs; more relevant for tweets, but kept in case a mention or URL appears in these web reviews
+     text = re.sub('\S*@\S*\s?', '', text) # Removes email addresses
+     text = re.sub(r'\((\d+)\)', '', text) # Removes numbers in parentheses
+     text = re.sub(r'^\d+', '', text) # Removes standalone leading numbers
+     text = re.sub(r'\n', '', text) # Removes line breaks
+     text = re.sub('\s+', ' ', text) # Removes extra whitespace
+     text = re.sub(r'[“”]', '', text) # Removes quotation-mark characters
+     text = re.sub(r'[()]', '', text) # Removes parentheses
+     text = re.sub('\.', '', text) # Removes periods
+     text = re.sub('\,', '', text) # Removes commas
+     text = re.sub('’s', '', text) # Removes possessives
+     #text = re.sub(r'-+', '', text) # Strips hyphens to join compound words (would normalize some cases, e.g. "exmujer" and "ex-mujer" both to "exmujer")
+     text = re.sub(r'\.{3}', ' ', text) # Replaces ellipses
+     # This regex was added "by hand" after seeing it was needed for some examples
+     text = re.sub(r"([\.\?])", r"\1 ", text) # Inserts a space after periods and question marks
+     # -> NFD (Normalization Form Canonical Decomposition) and remove diacritics
+     text = re.sub(r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1",
+                   normalize( "NFD", text), 0, re.I) # Removes diacritics (accented and otherwise marked character variants are reduced to their simple form, except for 'ñ')
+     # -> NFC (Normalization Form Canonical Composition)
+     text = normalize( 'NFC', text)
+
+     return text.lower().strip()
+
+ def _topic_clean_text(text, lemmatize=True, stem=True):
+     words = text.split()
+     non_stopwords = [word for word in words if word not in stopwords.words('english')]
+     clean_text = [_topic_sanitize_word(word) for word in non_stopwords]
+     if lemmatize:
+         lemmatizer = WordNetLemmatizer()
+         clean_text = [lemmatizer.lemmatize(word) for word in clean_text]
+     if stem:
+         ps =PorterStemmer()
+         clean_text = [ps.stem(word) for word in clean_text]
+
+     return ' '.join(clean_text).strip()
+
+ SECTOR_TOPICS = []
+ for sector in SECTOR_LIST:
+     topics, _ = bertopic_model.find_topics(_topic_clean_text(sector), top_n=5)
+     SECTOR_TOPICS.append(topics)
+
+ def _topic2sector(pred_topics):
+     out = []
+     for pred_topic in pred_topics:
+         relevant_sectors = []
+         for i in range(len(SECTOR_LIST)):
+             if pred_topic in SECTOR_TOPICS[i]:
+                 relevant_sectors.append(list(DF_SP500.Sector.unique())[i])
+         out.append(relevant_sectors)
+     return out
+
+ def _inference_topic_match(text):
+     out, _ = bertopic_model.transform([_topic_clean_text(t) for t in text])
+     return out
 
 def get_company_sectors(extracted_names, threshold=0.95):
     '''
     return result
 
 
 def _inference_ner_spancat(text, limit_outputs=10):
     nlp = spacy.load("en_pipeline")
     out = []
 
     return sigmoid(ort_outs[0])
 
+ def inference(input_batch,isurl,use_archive,filt_companies_topic,limit_companies=10):
     url_list = [] #Only used if isurl
     input_batch_content = []
     # if file_in.name is not "":
     if isurl:
         print("[i] Data is URL")
         if use_archive:
+             print("[i] Use cached URL from archive.org")
+         print("[i] Extracting contents using",EXTRACTOR_NET)
        for row_in in input_batch_r:
            if isinstance(row_in , list):
                url = row_in[0]
     sentiment = _inference_sentiment_model_pipeline(input_batch_content )
     print("[i] Running NER using custom spancat inference...")
     ner_labels = _inference_ner_spancat(input_batch_content ,limit_outputs=limit_companies)
+     print("[i] Extracting topic using custom BERTopic...")
+     topics = _inference_topic_match(input_batch_content)
+     news_sectors = _topic2sector(topics)
+
     df = pd.DataFrame(prob_outs,columns =['E','S','G'])
     if isurl:
         df['URL'] = url_list
     df['content_id'] = range(1, len(input_batch_r)+1)
     df['sent_lbl'] = [d['label'] for d in sentiment ]
     df['sent_score'] = [d['score'] for d in sentiment ]
+     df['topic'] = pd.DataFrame(news_sectors).iloc[:, 0]
     #df['sector_pred'] = pd.DataFrame(_topic2sector(topics)).iloc[:, 0]
     print("[i] Pandas output shape:",df.shape)
 
     for idx in range(len(df.index)):
         if ner_labels[idx]: #not empty
             for ner in ner_labels[idx]:
+                 if filt_companies_topic:
+                     if news_sectors[idx] != ner[1]:
+                         continue
                 df = pd.concat( [df, df.loc[[idx]].assign(company=ner[0], sector=ner[1])], join='outer', ignore_index=True) #axis=0
 
     return df #ner_labels, {'E':float(prob_outs[0]),"S":float(prob_outs[1]),"G":float(prob_outs[2])},{sentiment['label']:float(sentiment['score'])},"**Summary:**\n\n" + summary
 - List: list of text. Either a list of URLs of the news (English) or a list of extracted news contents
 - 'Data type': int. 0=list is of extracted news contents, 1=list is of URLs.
 - `use_archive`: boolean. The model will extract the archived version of the indicated URL from archive.org. This is useful for old news and to bypass news behind a paywall
+ - `filter_companies`: boolean. Filter companies by the news topic
 - `limit_companies`: integer. Number of relevant companies found to report.
 
 """
               ['https://www.bbc.com/news/world-europe-62766867'],
               ['https://www.bbc.com/news/business-62524031'],
               ['https://www.bbc.com/news/business-62728621'],
+               ['https://www.bbc.com/news/science-environment-62680423']],'url',False,False,5]]
 demo = gr.Interface(fn=inference,
                     inputs=[gr.Dataframe(label='input batch', col_count=1, datatype='str', type='array', wrap=True),
                             gr.Dropdown(label='data type', choices=['text','url'], type='index', value='url'),
+                             gr.Checkbox(label='Parse cached in archive.org'),
+                             gr.Checkbox(label='Filter out companies by topic'),
                             gr.Slider(minimum=1, maximum=10, step=1, label='Limit NER output', value=5)],
                     outputs=[gr.Dataframe(label='output raw', col_count=1, type='pandas', wrap=True, header=OUT_HEADERS)],
                     #gr.Label(label='Company'),
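
For readers who want to exercise the updated interface after this change, below is a minimal sketch of a programmatic call. It assumes the Space is served by Gradio 3.x and therefore exposes the standard /api/predict endpoint; the base URL is a placeholder, and the payload simply mirrors the new input order shown in the diff (URL batch, data type, use_archive, the new topic-filter checkbox, and the NER limit).

import requests

SPACE_URL = "https://your-space.hf.space"  # placeholder, not the real Space URL

# Payload mirrors the Gradio inputs in order:
# Dataframe (one [url] row per item), Dropdown ('url'), Checkbox (archive.org),
# Checkbox (filter companies by topic), Slider (NER limit).
payload = {
    "data": [
        [["https://www.bbc.com/news/science-environment-62680423"]],
        "url",   # data type
        False,   # use_archive
        False,   # filter out companies by topic
        5,       # limit NER output
    ]
}

resp = requests.post(f"{SPACE_URL}/api/predict", json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["data"][0])  # raw output dataframe as returned by the interface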