|
import re |
|
|
|
from nltk.stem import PorterStemmer, WordNetLemmatizer |
|
|
|
|
|
|
|
|
|
def expand_list_of_lists(list_of_lists):
    """
    Flattens one level of nesting.

    Args:
        list_of_lists: A list of lists of strings.

    Returns:
        A single list holding every string of every inner list, in the
        order they were encountered.
    """
    return [item for group in list_of_lists for item in group]
|
|
|
|
|
def keywords_no_companies(texts):
    """
    Splits texts into lowercase keywords and drops company names/tickers.

    Args:
        texts: A list of strings; each one may contain several
            whitespace-separated words.

    Returns:
        A flat list of lowercased words, in input order, excluding any word
        that is one of the known company names or ticker symbols.
    """
    # A frozenset gives constant-time membership tests and makes the
    # duplicated entries of the former list irrelevant.
    company_terms = frozenset(
        {
            "apple",
            "amd",
            "amazon",
            "cisco",
            "google",
            "microsoft",
            "nvidia",
            "asml",
            "intel",
            "micron",
            "aapl",
            "csco",
            "msft",
            "nvda",
            "googl",
            "mu",
            "intc",
            "amzn",
        }
    )

    # split() with no separator collapses runs of whitespace, so repeated or
    # trailing spaces no longer produce empty-string keywords, and a company
    # name followed by a newline/tab is still recognised and filtered out.
    lowered_words = (word.lower() for text in texts for word in text.split())
    return [word for word in lowered_words if word not in company_terms]
|
|
|
|
|
def all_keywords_combs(texts):
    """
    Produces every variant of the words found in the given texts.

    Args:
        texts: A list of strings; each one is split on single spaces.

    Returns:
        One list containing, in this order: the words as given, their
        lowercased forms, their PorterStemmer stems and their
        WordNetLemmatizer lemmas. The result is therefore four times as
        long as the number of words.
    """
    words = expand_list_of_lists([entry.split(" ") for entry in texts])

    lowered = [word.lower() for word in words]

    porter = PorterStemmer()
    stems = [porter.stem(word) for word in words]

    wordnet = WordNetLemmatizer()
    lemmas = [wordnet.lemmatize(word) for word in words]

    # Stems and lemmas are computed from the words as given, not from the
    # lowercased copies.
    return words + lowered + stems + lemmas
|
|
|
|
|
def extract_keywords(query_text, model):
    """
    Asks the model for the key entities of a query and cleans the answer.

    Args:
        query_text: The user query to extract keywords from.
        model: An object exposing ``predict(prompt)`` that returns the
            generated text as a string.

    Returns:
        The list produced by ``keywords_no_companies`` from the
        ", "-separated parts of the model response.
    """
    # This must be an f-string: without the prefix the literal text
    # "{query_text}" was sent to the model and the query itself was ignored.
    prompt = f"###Instruction: Identify the key entities that accurately describe the context.\n\nInput:{query_text}\n\n###Response:"

    response = model.predict(prompt)
    keywords = response.split(", ")
    return keywords_no_companies(keywords)
|
|
|
|
|
|
|
|
|
|
|
def generate_alpaca_ner_prompt(query):
    """
    Builds the few-shot prompt used to extract Company, Quarter and Year.

    The prompt consists of the output-format guidelines, six worked
    examples (a question followed by its
    "Company - ..., Quarter - ..., Year - ..." answer), the given query and
    a closing "### Response:" marker.

    Args:
        query: The sentence to analyse; it is inserted into the prompt
            unchanged.

    Returns:
        The complete prompt string.
    """
    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Use the following guidelines to extract the entities representing the Company, Quarter, and Year in the sentence.

### Instruction:
- The output should be in the form "Company - Value, Quarter - Value, Year - Value".
- The output should be in the form "Company - None, Quarter - None, Year - None", if no entities are found.
- Only use entities that exist in the final sentence.
- If Company cannot be found in the sentence, return "none" for that entity.
- If Quarter cannot be found in the sentence, return "none" for that entity.
- If Year cannot be found in the sentence, return "none" for that entity.
- If there is ambiguity finding the entity, return "none" for that entity.

### Input:

What was discussed regarding Services revenue performance in Apple's Q3 2020 earnings call?
Company - Apple, Quarter - Q3, Year - 2020

How has the growth in Q1 been for the consumer market as seen by AMD?
Company - AMD, Quarter - Q1, Year - none

What was the long term view on GOOGL's cloud business growth as discussed in their earnings call?
Company - Google, Quarter - none, Year - none

What is Nvidia's outlook in the data center business in Q3 2020?
Company - Nvidia, Quarter - Q3, Year - 2020

What are the expansion plans of Amazon in the Asia Pacific region as discussed in their earnings call?
Company - Amazon, Quarter - none, Year - none

What did the Analysts ask about CSCO's cybersecurity business in the earnings call in 2016?
Company - Cisco, Quarter - none, Year - 2016


{query}
### Response:"""
    return prompt
|
|
|
|
|
def format_entities_flan_alpaca(values):
    """
    Extracts the text for each entity from the output generated by the
    Flan-Alpaca model.

    Args:
        values: Model output expected to look like
            "Company - Apple, Quarter - Q3, Year - 2020".

    Returns:
        A ``(company, quarter, year)`` tuple. The company is lowercased;
        quarter and year are returned as written. An entity is ``None``
        when its value is "none" (any casing) or its part has no " - "
        separator. All three are ``None`` when ``values`` is not a string
        or does not split into exactly three ", "-separated parts.
    """
    try:
        company_string, quarter_string, year_string = values.split(", ")
    except (AttributeError, ValueError):
        # AttributeError: values is not a string (e.g. None).
        # ValueError: the output does not have exactly three parts.
        company = quarter = year = None
    else:
        company = _flan_alpaca_entity_value(company_string)
        if company is not None:
            company = company.lower()
        quarter = _flan_alpaca_entity_value(quarter_string)
        year = _flan_alpaca_entity_value(year_string)

    print((company, quarter, year))
    return company, quarter, year


def _flan_alpaca_entity_value(entity_string):
    """Returns the value of a "Label - Value" part, or None if absent/"none"."""
    parts = entity_string.split(" - ")
    if len(parts) < 2:
        # No " - " separator, so there is no value to report.
        return None
    value = parts[1]
    return None if value.lower() == "none" else value
|
|
|
|
|
def extract_quarter_year(string):
    """
    Finds the first quarter marker and the first year in a string.

    Args:
        string: The text to search.

    Returns:
        A ``(quarter, year)`` tuple. ``year`` is the first run of four
        digits and ``quarter`` is the first "Q" followed by a digit; each
        is ``None`` when no match is found.
    """
    found_year = re.search(r"\d{4}", string)
    found_quarter = re.search(r"Q\d", string)

    year = found_year.group() if found_year else None
    # The match is exactly "Q" plus one digit, so it is the quarter label.
    quarter = found_quarter.group() if found_quarter else None
    return quarter, year
|
|
|
|
|
def extract_ticker_spacy(query, model):
    """
    Returns the lowercased organisation entity found in a query, if any.

    Args:
        query: The text to analyse.
        model: A callable that returns an object whose ``ents`` are items
            with ``label_`` and ``text`` attributes.

    Returns:
        The lowercased text of the "ORG" entity, or ``None`` when no entity
        carries that label. When several entities share a label, the last
        one wins.
    """
    parsed = model(query)

    text_by_label = {}
    for entity in parsed.ents:
        # Later entities overwrite earlier ones with the same label.
        text_by_label[entity.label_] = entity.text

    print(text_by_label.keys())

    if "ORG" not in text_by_label:
        return None
    return text_by_label["ORG"].lower()
|
|
|
|
|
def clean_entities(company, quarter, year):
    """
    Converts extracted entities into positions within the choice lists.

    Args:
        company: Lowercase company name, or ``None``.
        quarter: Quarter label such as "Q3", or ``None``.
        year: Year as a string such as "2020", or ``None``.

    Returns:
        A ``(ticker_index, quarter_index, year_index)`` tuple. A missing or
        unknown company maps to index 0; a missing or unknown quarter or
        year maps to the last entry ("All") of its list.
    """
    tickers_by_company = {
        "apple": "AAPL",
        "amd": "AMD",
        "amazon": "AMZN",
        "cisco": "CSCO",
        "google": "GOOGL",
        "microsoft": "MSFT",
        "nvidia": "NVDA",
        "asml": "ASML",
        "intel": "INTC",
        "micron": "MU",
    }
    ticker_options = [
        "AAPL",
        "CSCO",
        "MSFT",
        "ASML",
        "NVDA",
        "GOOGL",
        "MU",
        "INTC",
        "AMZN",
        "AMD",
    ]
    year_options = ["2020", "2019", "2018", "2017", "2016", "All"]
    quarter_options = ["Q1", "Q2", "Q3", "Q4", "All"]

    def position_or_last(value, options):
        # None is never a member, so it falls through to the last entry too.
        if value in options:
            return options.index(value)
        return len(options) - 1

    symbol = tickers_by_company.get(company)
    # Unknown or missing companies default to the first ticker.
    ticker_index = 0 if symbol is None else ticker_options.index(symbol)

    quarter_index = position_or_last(quarter, quarter_options)
    year_index = position_or_last(year, year_options)
    return ticker_index, quarter_index, year_index
|
|