rdose committed
Commit b3e926c
1 Parent(s): 03d1953

Update app.py

Files changed (1)
  1. app.py +55 -16
app.py CHANGED
@@ -12,8 +12,10 @@ import os
 from transformers import pipeline
 import itertools
 import pandas as pd
+import thefuzz

 OUT_HEADERS = ['E','S','G']
+DF_SP500 = pd.read_csv('SP500_constituents.zip',compression=dict(method='zip'))

 MODEL_TRANSFORMER_BASED = "distilbert-base-uncased"
 MODEL_ONNX_FNAME = "ESG_classifier_batch.onnx"
@@ -24,24 +26,59 @@ MODEL_SENTIMENT_ANALYSIS = "ProsusAI/finbert"

 #API_HF_SENTIMENT_URL = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment"

+def get_company_sectors(extracted_names, threshold=0.95):
+    '''
+    '''
+    output = []
+    standard_names_tuples = []
+    for extracted_name in extracted_names:
+        name_match = thefuzz.process.extractOne(extracted_name,
+                                                DF_SP500.Name,
+                                                scorer=thefuzz.fuzz.token_set_ratio)
+        similarity = name_match[1]/100
+        if similarity >= threshold:
+            standard_names_tuples.append(name_match[:2])
+
+    for std_comp_name, _ in standard_names_tuples:
+        sectors = list(DF_SP500[['Name','Sector']].where(DF_SP500.Name == std_comp_name).dropna().itertuples(index=False, name=None))
+        output += sectors
+    return output
+
+def filter_spans(spans, keep_longest=True):
+    """Filter a sequence of spans and remove duplicates or overlaps. Useful for
+    creating named entities (where one token can only be part of one entity) or
+    when merging spans with `Retokenizer.merge`. When spans overlap, the (first)
+    longest span is preferred over shorter spans.
+    spans (Iterable[Span]): The spans to filter.
+    keep_longest (bool): Specify whether to keep longer or shorter spans.
+    RETURNS (List[Span]): The filtered spans.
+    """
+    get_sort_key = lambda span: (span.end - span.start, -span.start)
+    sorted_spans = sorted(spans, key=get_sort_key, reverse=keep_longest)
+    #print(f'sorted_spans: {sorted_spans}')
+    result = []
+    seen_tokens = set()
+    for span in sorted_spans:
+        # Check for end - 1 here because boundaries are inclusive
+        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
+            result.append(span)
+            seen_tokens.update(range(span.start, span.end))
+    result = sorted(result, key=lambda span: span.start)
+    return result
+
+
+
 def _inference_ner_spancat(text, summary, penalty=0.5, normalise=True, limit_outputs=10):
     nlp = spacy.load("en_pipeline")
-    doc = nlp(text)
-    spans = doc.spans["sc"]
-    comp_raw_text = dict( sorted( dict(zip([str(x) for x in spans],[float(x)*penalty for x in spans.attrs['scores']])).items(), key=lambda x: x[1], reverse=True) )
-    doc = nlp(summary)
-    spans = doc.spans["sc"]
-    exceeds_one = 0.0
-    for comp_s in spans:
-        if str(comp_s) in comp_raw_text.keys():
-            comp_raw_text[str(comp_s)] = comp_raw_text[str(comp_s)] / penalty
-            temp_max = comp_raw_text[str(comp_s)] if comp_raw_text[str(comp_s)] > 1.0 else 0.0
-            exceeds_one = comp_raw_text[str(comp_s)] if temp_max > exceeds_one else exceeds_one
-    #This "exceeds_one" is a bit confusing. So the thing is that the penalty is reverted for each time the company appears in the summary and hence the value can exceed one when the company appears more than once. The normalisation means that all the other scores are divided by the maximum when any value exceeds one
-    if normalise and (exceeds_one > 1):
-        comp_raw_text = {k: v/exceeds_one for k, v in comp_raw_text.items()}
+    out = []
+    for doc in nlp.pipe(text):
+        spans = doc.spans["sc"]
+        #comp_raw_text = dict( sorted( dict(zip([str(x) for x in spans],[float(x)*penalty for x in spans.attrs['scores']])).items(), key=lambda x: x[1], reverse=True) )
+
+        company_list = list(set([str(span).replace('\'s', '') for span in filter_spans(spans, keep_longest=True)]))[:limit_outputs]
+        out.append(get_company_sectors(company_list))

-    return dict(itertools.islice(sorted(comp_raw_text.items(), key=lambda x: x[1], reverse=True), limit_outputs))
+    return out

 #def _inference_summary_model_pipeline(text):
 #    pipe = pipeline("text2text-generation", model=MODEL_SUMMARY_PEGASUS)
@@ -162,8 +199,10 @@ def inference(input_batch,isurl,use_archive,limit_companies=10):
     print("[i] Running sentiment using",MODEL_SENTIMENT_ANALYSIS ,"inference...")
     #sentiment = _inference_sentiment_model_via_api_query({"inputs": extracted['content']})
     sentiment = _inference_sentiment_model_pipeline(input_batch_content )
+    print("[i] Running NER using custom spancat inference...")
     #summary = _inference_summary_model_pipeline(input_batch_content )[0]['generated_text']
-    #ner_labels = _inference_ner_spancat(input_batch_content ,summary, penalty = 0.8, limit_outputs=limit_companies)
+    ner_labels = _inference_ner_spancat(input_batch_content ,limit_outputs=limit_companies)
+    print(ner_labels)
     df = pd.DataFrame(prob_outs,columns =['E','S','G'])
     if isurl:
         df['URL'] = url_list
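
Note (not part of the commit): the new get_company_sectors() helper fuzzy-matches extracted company mentions against the Name column of the S&P 500 constituents table and returns (Name, Sector) tuples, while filter_spans() appears to mirror spacy.util.filter_spans. The following is a minimal sketch of the matching idea only; the DF_DEMO table and the match_sectors name are hypothetical stand-ins for SP500_constituents.zip and the committed helper.

import pandas as pd
from thefuzz import fuzz, process  # documented import style; the commit itself uses `import thefuzz`

# Hypothetical stand-in for DF_SP500 (the real app loads SP500_constituents.zip).
DF_DEMO = pd.DataFrame({
    "Name":   ["Apple Inc.", "Microsoft Corporation", "Exxon Mobil Corporation"],
    "Sector": ["Information Technology", "Information Technology", "Energy"],
})

def match_sectors(extracted_names, threshold=0.95):
    """Map free-text company mentions to (Name, Sector) rows via fuzzy matching."""
    output = []
    for extracted_name in extracted_names:
        # extractOne over a Series behaves like a dict lookup: returns (value, score, index)
        name, score = process.extractOne(extracted_name, DF_DEMO.Name,
                                         scorer=fuzz.token_set_ratio)[:2]
        if score / 100 >= threshold:
            rows = DF_DEMO.loc[DF_DEMO.Name == name, ["Name", "Sector"]]
            output += list(rows.itertuples(index=False, name=None))
    return output

print(match_sectors(["Apple", "Microsoft"]))
# expected: [('Apple Inc.', 'Information Technology'),
#            ('Microsoft Corporation', 'Information Technology')]

With fuzz.token_set_ratio, a partial mention such as "Apple" scores 100 against "Apple Inc.", so it clears the 0.95 threshold, while unrelated names fall below it and are dropped.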