# 2-qa-earnings-sentencewise/utils/entity_extraction.py
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Keyword Extraction
def expand_list_of_lists(list_of_lists):
    """
    Expands a list of lists of strings into a flat list of strings.

    Args:
        list_of_lists: A list of lists of strings.

    Returns:
        A list of strings.
    """
    expanded_list = []
    for inner_list in list_of_lists:
        for string in inner_list:
            expanded_list.append(string)
    return expanded_list
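# Illustrative usage (example input/output only, nothing is executed on import):
# expand_list_of_lists([["net", "income"], ["gross", "margin"]])
# -> ["net", "income", "gross", "margin"]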
def keywords_no_companies(texts):
    """Splits the input strings into lowercase tokens and drops company names and tickers."""
    # Company list (to remove companies from extracted entities)
    company_list = [
        "apple",
        "amd",
        "amazon",
        "cisco",
        "google",
        "microsoft",
        "nvidia",
        "asml",
        "intel",
        "micron",
        "aapl",
        "csco",
        "msft",
        "nvda",
        "googl",
        "mu",
        "intc",
        "amzn",
    ]
    texts = [text.split(" ") for text in texts]
    texts = expand_list_of_lists(texts)
    # Convert all strings to lowercase.
    lower_texts = [text.lower() for text in texts]
    keywords = [text for text in lower_texts if text not in company_list]
    return keywords
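# Illustrative usage: tokens matching a tracked company name or ticker are
# removed, all other tokens are kept in lowercase, e.g.
# keywords_no_companies(["Apple revenue", "cloud growth"])
# -> ["revenue", "cloud", "growth"]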
def all_keywords_combs(texts):
    """Returns the original tokens plus their lowercased, stemmed, and lemmatized variants."""
    texts = [text.split(" ") for text in texts]
    texts = expand_list_of_lists(texts)
    # Convert all strings to lowercase.
    lower_texts = [text.lower() for text in texts]
    # Stem each lowercased token.
    stemmer = PorterStemmer()
    stem_texts = [stemmer.stem(text) for text in lower_texts]
    # Lemmatize each lowercased token.
    lemmatizer = WordNetLemmatizer()
    lemm_texts = [lemmatizer.lemmatize(text) for text in lower_texts]
    texts.extend(lower_texts)
    texts.extend(stem_texts)
    texts.extend(lemm_texts)
    return texts
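# Illustrative usage: for an input like ["Margins"], the returned list contains
# the original token followed by its lowercased, stemmed, and lemmatized forms;
# duplicates between the variants are preserved.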
def extract_keywords(query_text, model):
    prompt = f"###Instruction: Identify the key entities that accurately describe the context.\n\nInput:{query_text}\n\n###Response:"
    # prompt = f"###Instruction:Extract the important keywords which describe the context accurately.\n\nInput:{query_text}\n\n###Response:"
    response = model.predict(prompt)
    keywords = response.split(", ")
    keywords = keywords_no_companies(keywords)
    return keywords
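# Illustrative usage (assumes `model` is an LLM wrapper exposing a
# `predict(prompt)` method that returns a comma-separated string of entities,
# as the call above implies):
# extract_keywords("What was discussed regarding Services revenue performance?", model)
# -> e.g. ["services", "revenue", "performance"]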
# Entity Extraction
def generate_alpaca_ner_prompt(query):
    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Use the following guidelines to extract the entities representing the Company, Quarter, and Year in the sentence.
### Instruction:
- The output should be in the form "Company - Value, Quarter - Value, Year - Value".
- The output should be in the form "Company - None, Quarter - None, Year - None", if no entities are found.
- Only use entities that exist in the final sentence.
- If Company cannot be found in the sentence, return "none" for that entity.
- If Quarter cannot be found in the sentence, return "none" for that entity.
- If Year cannot be found in the sentence, return "none" for that entity.
- If there is ambiguity finding the entity, return "none" for that entity.
### Input:
What was discussed regarding Services revenue performance in Apple's Q3 2020 earnings call?
Company - Apple, Quarter - Q3, Year - 2020
How has the growth in Q1 been for the consumer market as seen by AMD?
Company - AMD, Quarter - Q1, Year - none
What was the long term view on GOOGL's cloud business growth as discussed in their earnings call?
Company - Google, Quarter - none, Year - none
What is Nvidia's outlook in the data center business in Q3 2020?
Company - Nvidia, Quarter - Q3, Year - 2020
What are the expansion plans of Amazon in the Asia Pacific region as discussed in their earnings call?
Company - Amazon, Quarter - none, Year - none
What did the Analysts ask about CSCO's cybersecurity business in the earnings call in 2016?
Company - Cisco, Quarter - none, Year - 2016
{query}
### Response:"""
    return prompt
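# Illustrative usage: the user query is appended after the few-shot examples
# above, and the model is expected to answer with a single
# "Company - ..., Quarter - ..., Year - ..." line, e.g.
# generate_alpaca_ner_prompt("What is Apple's outlook for Q2 2018?")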
def format_entities_flan_alpaca(values):
    """
    Extracts the text for each entity from the output generated by the
    Flan-Alpaca model.
    """
    try:
        company_string, quarter_string, year_string = values.split(", ")
    except ValueError:
        # The model output does not follow the expected
        # "Company - ..., Quarter - ..., Year - ..." format.
        return None, None, None
    try:
        company = company_string.split(" - ")[1].lower()
        company = None if company == "none" else company
    except IndexError:
        company = None
    try:
        quarter = quarter_string.split(" - ")[1]
        quarter = None if quarter.lower() == "none" else quarter
    except IndexError:
        quarter = None
    try:
        year = year_string.split(" - ")[1]
        year = None if year.lower() == "none" else year
    except IndexError:
        year = None
    print((company, quarter, year))
    return company, quarter, year
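# Illustrative usage:
# format_entities_flan_alpaca("Company - Apple, Quarter - Q3, Year - 2020")
# -> ("apple", "Q3", "2020")
# format_entities_flan_alpaca("Company - none, Quarter - none, Year - none")
# -> (None, None, None)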
def extract_quarter_year(string):
    # Extract year from string
    year_match = re.search(r"\d{4}", string)
    if year_match:
        year = year_match.group()
    else:
        year = None
    # Extract quarter from string
    quarter_match = re.search(r"Q\d", string)
    if quarter_match:
        quarter = quarter_match.group()
    else:
        quarter = None
    return quarter, year
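# Illustrative usage:
# extract_quarter_year("What is Nvidia's outlook in the data center business in Q3 2020?")
# -> ("Q3", "2020")
# extract_quarter_year("How did margins develop over the year?")
# -> (None, None)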
def extract_ticker_spacy(query, model):
    doc = model(query)
    entities = {ent.label_: ent.text for ent in doc.ents}
    print(entities.keys())
    if "ORG" in entities:
        company = entities["ORG"].lower()
    else:
        company = None
    return company
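# Illustrative usage (assumes `model` is a loaded spaCy pipeline with an NER
# component, e.g. nlp = spacy.load("en_core_web_sm"); the model name is only
# an assumption, any pipeline that produces ORG entities works):
# extract_ticker_spacy("What did Microsoft report last quarter?", nlp)
# -> "microsoft"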
def clean_entities(company, quarter, year):
    """
    Maps the extracted company, quarter, and year to indices into the ticker,
    quarter, and year choice lists. Falls back to the first ticker when the
    company is missing or unrecognized, and to "All" for a missing or
    unrecognized quarter or year.
    """
    company_ticker_map = {
        "apple": "AAPL",
        "amd": "AMD",
        "amazon": "AMZN",
        "cisco": "CSCO",
        "google": "GOOGL",
        "microsoft": "MSFT",
        "nvidia": "NVDA",
        "asml": "ASML",
        "intel": "INTC",
        "micron": "MU",
    }
    ticker_choice = [
        "AAPL",
        "CSCO",
        "MSFT",
        "ASML",
        "NVDA",
        "GOOGL",
        "MU",
        "INTC",
        "AMZN",
        "AMD",
    ]
    year_choice = ["2020", "2019", "2018", "2017", "2016", "All"]
    quarter_choice = ["Q1", "Q2", "Q3", "Q4", "All"]
    if company is not None and company in company_ticker_map:
        ticker = company_ticker_map[company]
        ticker_index = ticker_choice.index(ticker)
    else:
        ticker_index = 0
    if quarter is not None and quarter in quarter_choice:
        quarter_index = quarter_choice.index(quarter)
    else:
        quarter_index = len(quarter_choice) - 1
    if year is not None and year in year_choice:
        year_index = year_choice.index(year)
    else:
        year_index = len(year_choice) - 1
    return ticker_index, quarter_index, year_index
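# Minimal smoke test of the model-free helpers. This is an illustrative sketch
# only: the query and the "model response" string below are made up for
# demonstration and are not part of the application flow.
if __name__ == "__main__":
    print(extract_quarter_year("What is Nvidia's outlook in the data center business in Q3 2020?"))
    company, quarter, year = format_entities_flan_alpaca(
        "Company - Nvidia, Quarter - Q3, Year - 2020"
    )
    print(clean_entities(company, quarter, year))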