import re
from nltk.stem import PorterStemmer, WordNetLemmatizer


# Keyword Extraction
def expand_list_of_lists(list_of_lists):
    """
    Expands a list of lists of strings into a flat list of strings.

    Args:
        list_of_lists: A list of lists of strings.

    Returns:
        A list of strings.
    """
    expanded_list = []
    for inner_list in list_of_lists:
        for string in inner_list:
            expanded_list.append(string)
    return expanded_list


def keywords_no_companies(texts):
    """Splits the extracted keyword strings into tokens and drops company names/tickers."""
    # Company list (to remove companies from extracted entities)
    company_list = [
        "apple",
        "amd",
        "amazon",
        "cisco",
        "google",
        "microsoft",
        "nvidia",
        "asml",
        "intel",
        "micron",
        "aapl",
        "csco",
        "msft",
        "nvda",
        "googl",
        "mu",
        "intc",
        "amzn",
    ]
    # Split each keyword string into tokens and flatten into a single list.
    texts = [text.split(" ") for text in texts]
    texts = expand_list_of_lists(texts)
    # Convert all strings to lowercase.
    lower_texts = [text.lower() for text in texts]
    keywords = [text for text in lower_texts if text not in company_list]
    return keywords


def all_keywords_combs(texts):
    """Returns the original tokens plus their lowercase, stemmed, and lemmatized variants."""
    texts = [text.split(" ") for text in texts]
    texts = expand_list_of_lists(texts)
    # Convert all strings to lowercase.
    lower_texts = [text.lower() for text in texts]
    # Stem the words in each string.
    stemmer = PorterStemmer()
    stem_texts = [stemmer.stem(text) for text in texts]
    # Lemmatize the words in each string.
    lemmatizer = WordNetLemmatizer()
    lemm_texts = [lemmatizer.lemmatize(text) for text in texts]
    texts.extend(lower_texts)
    texts.extend(stem_texts)
    texts.extend(lemm_texts)
    return texts


def extract_keywords(query_text, model):
    """Asks the model for the key entities in the query and strips company names."""
    prompt = f"###Instruction: Identify the key entities that accurately describe the context.\n\nInput:{query_text}\n\n###Response:"
    # prompt = f"###Instruction:Extract the important keywords which describe the context accurately.\n\nInput:{query_text}\n\n###Response:"
    response = model.predict(prompt)
    keywords = response.split(", ")
    keywords = keywords_no_companies(keywords)
    return keywords


# Entity Extraction
def generate_alpaca_ner_prompt(query):
    """Builds a few-shot prompt for extracting Company, Quarter, and Year from a query."""
    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Use the following guidelines to extract the entities representing the Company, Quarter, and Year in the sentence.

### Instruction:
- The output should be in the form "Company - Value, Quarter - Value, Year - Value".
- The output should be in the form "Company - None, Quarter - None, Year - None", if no entities are found.
- Only use entities that exist in the final sentence.
- If Company cannot be found in the sentence, return "none" for that entity.
- If Quarter cannot be found in the sentence, return "none" for that entity.
- If Year cannot be found in the sentence, return "none" for that entity.
- If there is ambiguity finding the entity, return "none" for that entity.

### Input:
What was discussed regarding Services revenue performance in Apple's Q3 2020 earnings call?
Company - Apple, Quarter - Q3, Year - 2020

How has the growth in Q1 been for the consumer market as seen by AMD?
Company - AMD, Quarter - Q1, Year - none

What was the long term view on GOOGL's cloud business growth as discussed in their earnings call?
Company - Google, Quarter - none, Year - none

What is Nvidia's outlook in the data center business in Q3 2020?
Company - Nvidia, Quarter - Q3, Year - 2020

What are the expansion plans of Amazon in the Asia Pacific region as discussed in their earnings call?
Company - Amazon, Quarter - none, Year - none

What did the Analysts ask about CSCO's cybersecurity business in the earnings call in 2016?
Company - Cisco, Quarter - none, Year - 2016

{query}

### Response:"""
    return prompt


def format_entities_flan_alpaca(values):
    """
    Extracts the text for each entity from the output generated by the Flan-Alpaca model.
    """
    try:
        company_string, quarter_string, year_string = values.split(", ")
    except ValueError:
        # The response did not match "Company - ..., Quarter - ..., Year - ...".
        company_string = quarter_string = year_string = ""

    try:
        company = company_string.split(" - ")[1].lower()
        company = None if company == "none" else company
    except IndexError:
        company = None

    try:
        quarter = quarter_string.split(" - ")[1]
        quarter = None if quarter.lower() == "none" else quarter
    except IndexError:
        quarter = None

    try:
        year = year_string.split(" - ")[1]
        year = None if year.lower() == "none" else year
    except IndexError:
        year = None

    print((company, quarter, year))
    return company, quarter, year


def extract_quarter_year(string):
    """Extracts the quarter (e.g. "Q3") and the four-digit year from a string via regex."""
    # Extract year from string
    year_match = re.search(r"\d{4}", string)
    if year_match:
        year = year_match.group()
    else:
        year = None

    # Extract quarter from string
    quarter_match = re.search(r"Q\d", string)
    if quarter_match:
        quarter = "Q" + quarter_match.group()[1]
    else:
        quarter = None

    return quarter, year


def extract_ticker_spacy(query, model):
    """Returns the first ORG entity found by the spaCy model, lowercased, or None."""
    doc = model(query)
    entities = {ent.label_: ent.text for ent in doc.ents}
    print(entities.keys())
    if "ORG" in entities:
        company = entities["ORG"].lower()
    else:
        company = None
    return company


def clean_entities(company, quarter, year):
    """Maps the extracted entities to indices in the ticker/quarter/year choice lists."""
    company_ticker_map = {
        "apple": "AAPL",
        "amd": "AMD",
        "amazon": "AMZN",
        "cisco": "CSCO",
        "google": "GOOGL",
        "microsoft": "MSFT",
        "nvidia": "NVDA",
        "asml": "ASML",
        "intel": "INTC",
        "micron": "MU",
    }
    ticker_choice = [
        "AAPL",
        "CSCO",
        "MSFT",
        "ASML",
        "NVDA",
        "GOOGL",
        "MU",
        "INTC",
        "AMZN",
        "AMD",
    ]
    year_choice = ["2020", "2019", "2018", "2017", "2016", "All"]
    quarter_choice = ["Q1", "Q2", "Q3", "Q4", "All"]

    # Company: map to its ticker index, defaulting to the first ticker if unknown.
    if company is not None and company in company_ticker_map:
        ticker = company_ticker_map[company]
        ticker_index = ticker_choice.index(ticker)
    else:
        ticker_index = 0

    # Quarter: default to "All" when missing or unrecognized.
    if quarter is not None and quarter in quarter_choice:
        quarter_index = quarter_choice.index(quarter)
    else:
        quarter_index = len(quarter_choice) - 1

    # Year: default to "All" when missing or unrecognized.
    if year is not None and year in year_choice:
        year_index = year_choice.index(year)
    else:
        year_index = len(year_choice) - 1

    return ticker_index, quarter_index, year_index
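

# --- Illustrative usage (a minimal sketch, not part of the original module) ---
# Exercises only the pure helpers above, so no LLM or spaCy model is required.
# The sample strings are hypothetical inputs/outputs chosen to show the expected formats.
if __name__ == "__main__":
    # Company names/tickers are stripped from extracted keyword strings.
    print(keywords_no_companies(["Apple Services revenue", "cloud growth"]))

    # Parse a Flan-Alpaca style NER response into (company, quarter, year).
    company, quarter, year = format_entities_flan_alpaca(
        "Company - Apple, Quarter - Q3, Year - 2020"
    )

    # Regex-based fallback extraction of quarter and year from raw text.
    print(extract_quarter_year("Apple Q3 2020 earnings call"))

    # Map the parsed entities to indices in the ticker/quarter/year choice lists.
    print(clean_entities(company, quarter, year))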