vtiyyal1 commited on
Commit
12cca3e
1 Parent(s): fb70715

Upload 10 files

Browse files

Latest code to verify the citations fix

Files changed (10) hide show
  1. .gitattributes +35 -35
  2. README.md +13 -13
  3. app.py +80 -0
  4. feed_to_llm.py +101 -0
  5. feed_to_llm_v2.py +102 -0
  6. full_chain.py +46 -0
  7. get_articles.py +140 -0
  8. get_keywords.py +63 -0
  9. requirements.txt +15 -0
  10. rerank.py +273 -0
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,13 @@
1
- ---
2
- title: Tobacco Watcher Chat With Citations
3
- emoji: 👁
4
- colorFrom: yellow
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 5.6.0
8
- app_file: app.py
9
- pinned: false
10
- short_description: https://tobaccowatcher.globaltobaccocontrol.org/
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: Tobacco Watcher Chat
3
+ emoji: 🐨
4
+ colorFrom: indigo
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 4.25.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import gradio as gr
3
+ from full_chain import get_response
4
+ import os
5
+ import logging
6
+
7
+ # Configure logging
8
+ logging.basicConfig(
9
+ level=logging.INFO,
10
+ format='%(asctime)s - %(levelname)s - %(message)s',
11
+ handlers=[
12
+ logging.FileHandler('app.log'),
13
+ logging.StreamHandler()
14
+ ]
15
+ )
16
+ logger = logging.getLogger(__name__)
17
+
18
+ # Initialize OpenAI client
19
+ try:
20
+ api_key = os.getenv("OPENAI_API_KEY")
21
+ if not api_key:
22
+ raise ValueError("OPENAI_API_KEY environment variable not set")
23
+ client = openai.OpenAI(api_key=api_key)
24
+ logger.info("OpenAI client initialized successfully")
25
+ except Exception as e:
26
+ logger.error(f"Failed to initialize OpenAI client: {str(e)}")
27
+ raise
28
+
29
def create_hyperlink(url, title, domain):
    """Return an HTML anchor for *url* labelled with *title*, followed by *domain*.

    The title and domain come from article metadata and may contain quotes or
    angle brackets; they are HTML-escaped so they cannot break the markup that
    Gradio renders. The URL is escaped with quote=True since it sits inside a
    quoted attribute.
    """
    import html  # local import keeps the module prelude unchanged
    safe_url = html.escape(url, quote=True)
    safe_title = html.escape(title)
    safe_domain = html.escape(domain)
    return f"<a href='{safe_url}'>{safe_title}</a> ({safe_domain})"
32
+
33
def predict(message, history):
    """Answer one chat message and append hyperlinked sources.

    ``history`` is supplied by gr.ChatInterface but is not consulted; every
    query is answered statelessly. On failure, a user-facing error string is
    returned instead of raising.
    """
    try:
        logger.info(f"Processing new query: {message}")

        # Run the retrieval + rerank + LLM pipeline.
        answer, urls, names, sites = get_response(message, rerank_type="crossencoder")
        logger.info(f"Received response with {len(urls)} sources")

        # Render one anchor per retrieved source.
        anchors = []
        for url, name, site in zip(urls, names, sites):
            anchors.append(create_hyperlink(url, name, site))

        result = answer + "\n" + "\n".join(anchors)

        logger.info("Response generated successfully")
        return result

    except Exception as e:
        error_msg = f"Error processing query: {str(e)}"
        logger.error(error_msg)
        return f"An error occurred while processing your request: {str(e)}"
56
+
57
# Canned questions shown under the chat box.
EXAMPLE_QUERIES = [
    "How many Americans Smoke?",
    "What are some measures taken by the Indian Government to reduce the smoking population?",
    "Does smoking negatively affect my health?"
]


def main():
    """Build the Gradio chat UI and serve it; re-raise on launch failure."""
    try:
        chat_ui = gr.ChatInterface(
            predict,
            examples=EXAMPLE_QUERIES,
            title="Tobacco Information Assistant",
            description="Ask questions about tobacco-related topics and get answers with reliable sources."
        )
        logger.info("Starting Gradio interface")
        chat_ui.launch()
    except Exception as e:
        logger.error(f"Failed to launch Gradio interface: {str(e)}")
        raise


if __name__ == "__main__":
    main()
feed_to_llm.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.chat_models import ChatOpenAI
2
+
3
+ from langchain.schema import (
4
+ HumanMessage,
5
+ SystemMessage
6
+ )
7
+ import tiktoken
8
+ import re
9
+
10
+
11
def num_tokens_from_string(string: str, encoder) -> int:
    """Return how many tokens *encoder* produces for *string*."""
    return len(encoder.encode(string))
14
+
15
+
16
def feed_articles_to_gpt_with_links(information, question):
    """Answer *question* with GPT using reranked article snippets.

    Parameters
    ----------
    information : list of (score, content, uuid, title, domain) tuples, as
        produced by the rerank helpers.
    question : str

    Returns
    -------
    (response_text, links, titles, domains) — the lists are empty when the
    model could not find an answer in the provided articles.
    """
    prompt = "The following pieces of information includes relevant articles. \nUse the following sentences to answer question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer. "
    prompt += "Please state the number of the article used to answer the question after your response\n"
    end_prompt = "\n----------------\n"
    prompt += end_prompt
    content = ""
    seperator = "<<<<>>>>"

    token_count = 0
    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    token_count += num_tokens_from_string(prompt, encoder)

    articles = [contents for score, contents, uuids, titles, domains in information]
    uuids = [uuids for score, contents, uuids, titles, domains in information]
    titles = [titles for score, contents, uuids, titles, domains in information]
    domains = [domains for score, contents, uuids, titles, domains in information]

    for i in range(len(articles)):
        # BUG FIX: the article body was previously concatenated twice per
        # addition, doubling its token cost and halving the budget.
        addition = "Article " + str(i + 1) + ": " + articles[i] + seperator
        token_count += num_tokens_from_string(addition, encoder)
        if token_count > 3500:
            # Token budget exhausted; drop the remaining articles.
            print(i)
            break

        content += addition

    prompt += content
    llm = ChatOpenAI(temperature=0.0)
    message = [
        SystemMessage(content=prompt),
        HumanMessage(content=question)
    ]

    response = llm(message)
    print(response.content)
    print("response length: ", len(response.content))

    # Ask the model once whether an answer was found; the previous code made
    # two separate calls whose results could disagree.
    answer_found_prompt = "Please check if the following response found the answer. If yes, return 1 and if no, return 0. \n"
    check_message = [
        SystemMessage(content=answer_found_prompt),
        HumanMessage(content=response.content)
    ]
    verdict = llm(check_message).content
    print(verdict)
    if verdict == "0":
        return "I could not find the answer.", [], [], []

    # Parse "article N" references out of the response text.
    lowercase_response = response.content.lower()
    # remove parentheses
    lowercase_response = re.sub('[()]', '', lowercase_response)
    lowercase_split = lowercase_response.split()
    used_article_num = []
    # Stop one word early so "article" as the final word cannot index past
    # the end of the list.
    for i in range(len(lowercase_split) - 1):
        if lowercase_split[i] == "article":
            # keep only digits; skip mentions not followed by a number
            digits = ''.join(c for c in lowercase_split[i + 1] if c.isdigit())
            print("Article number: ", digits)
            if digits and digits not in used_article_num:
                used_article_num.append(digits)

    print("Used article num: ", used_article_num)
    if not used_article_num:
        print("I could not find the answer. Reached")
        return "I could not find the answer.", [], [], []

    indices = [int(num) - 1 for num in used_article_num]
    # Guard against hallucinated article numbers outside the provided range,
    # which previously raised IndexError.
    indices = [idx for idx in indices if 0 <= idx < len(uuids)]
    if not indices:
        return "I could not find the answer.", [], [], []

    all_links = [f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/" for uuid in uuids]

    links = [all_links[idx] for idx in indices]
    out_titles = [titles[idx] for idx in indices]
    out_domains = [domains[idx] for idx in indices]

    # get rid of substring that starts with (Article and ends with )
    response_without_source = re.sub(r"\(Article.*\)", "", response.content)

    return response_without_source, links, out_titles, out_domains
feed_to_llm_v2.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_openai import ChatOpenAI
2
+
3
+ from langchain.schema import (
4
+ HumanMessage,
5
+ SystemMessage
6
+ )
7
+ import tiktoken
8
+ import re
9
+
10
+ from get_articles import save_solr_articles_full
11
+ from rerank import crossencoder_rerank_answer
12
+
13
+
14
def num_tokens_from_string(string: str, encoder) -> int:
    """Count the tokens *encoder* yields for *string*."""
    tokens = encoder.encode(string)
    return len(tokens)
17
+
18
+
19
def feed_articles_to_gpt_with_links(information, question):
    """Answer *question* with GPT-4o-mini and attach numbered citations.

    Parameters
    ----------
    information : list of (score, content, uuid, title, domain) tuples, as
        produced by the rerank helpers.
    question : str

    Returns
    -------
    (response_with_citations, hyperlinks, titles, domains). The hyperlink /
    title / domain lists cover all provided articles; the citation list
    appended to the response covers only the articles the model cited.
    """
    prompt = """
    You are a Question Answering system specializing in tobacco-related topics. You have access to several curated articles, each numbered (e.g., Article 1, Article 2). These articles cover various aspects of tobacco use, health effects, legislation, and quitting resources.

    When formulating your response, adhere to the following guidelines:

    1. Use information from the provided articles to directly answer the question. Explicitly reference the article(s) used in your response by stating the article number(s) (e.g., "According to Article 1, ..." or "Articles 2 and 3 mention that...").
    2. If the answer is not covered by any of the articles, clearly state that the information is unavailable. Do not guess or fabricate information.
    3. Avoid using ambiguous time references like 'recently' or 'last year.' Instead, use absolute terms based on the article's content (e.g., 'In 2021' or 'As per Article 2, published in 2020').
    4. Keep responses concise, accurate, and helpful while maintaining a professional tone.

    Below is a list of articles you can reference. Each article is identified by its number and content:
    """
    end_prompt = "\n----------------\n"
    prompt += end_prompt

    content = ""
    separator = "<<<<>>>>"
    token_count = 0

    # Encoder setup for token count tracking
    encoder = tiktoken.encoding_for_model("gpt-3.5-turbo")
    token_count += num_tokens_from_string(prompt, encoder)

    # Add articles to the prompt until the token budget is exhausted.
    articles = [contents for score, contents, uuids, titles, domains in information]
    uuids = [uuids for score, contents, uuids, titles, domains in information]
    titles_list = [titles for score, contents, uuids, titles, domains in information]
    domains_list = [domains for score, contents, uuids, titles, domains in information]

    for i in range(len(articles)):
        addition = f"Article {i + 1}: {articles[i]} {separator}"
        token_count += num_tokens_from_string(addition, encoder)
        if token_count > 3500:
            break
        content += addition

    prompt += content
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)
    message = [
        SystemMessage(content=prompt),
        HumanMessage(content=question)
    ]

    response = llm.invoke(message)
    response_content = response.content  # Access the content of the AIMessage
    print("LLM Response Content:", response_content)

    # BUG FIX: the system prompt instructs citations like "According to
    # Article 1, ..." (no parentheses), but the old regex only matched
    # "(Article N)", so most citations were silently dropped. Match both.
    matches = re.findall(r'\(?(Article \d+)\)?', response_content)
    if not matches:
        print("No sources found in the response.")
        return response_content, [], [], []

    # Deterministic first-mention order (set() previously produced arbitrary,
    # run-to-run-varying citation numbering). Also drop hallucinated article
    # numbers outside the provided range, which raised IndexError before.
    ordered_unique = list(dict.fromkeys(matches))
    valid = []
    for m in ordered_unique:
        art_idx = int(m.split()[1]) - 1
        if 0 <= art_idx < len(titles_list):
            valid.append((m, art_idx))
    if not valid:
        print("No sources found in the response.")
        return response_content, [], [], []

    # Create citation list
    citations = [f"{pos}. {titles_list[art_idx]} ({domains_list[art_idx]})"
                 for pos, (m, art_idx) in enumerate(valid, start=1)]

    # Replace article mentions with citation numbers. The negative lookahead
    # stops "Article 1" from clobbering part of "Article 12" (the old
    # str.replace corrupted higher-numbered citations).
    for pos, (m, art_idx) in enumerate(valid, start=1):
        response_content = re.sub(r'\(?' + re.escape(m) + r'\)?(?!\d)',
                                  f"[{pos}]", response_content)

    # Append citations to the response
    response_with_citations = f"{response_content}\n\nReferences:\n" + "\n".join(citations)

    # Prepare links with titles and domains for all provided articles.
    links = [f"https://tobaccowatcher.globaltobaccocontrol.org/articles/{uuid}/" for uuid in uuids]
    hyperlinks = [f"<a href='{link}' target='_blank'>{titles_list[i]}</a> ({domains_list[i]})" for i, link in enumerate(links)]

    return response_with_citations, hyperlinks, titles_list, domains_list
94
+
95
+
96
if __name__ == "__main__":
    # Manual end-to-end smoke test of retrieve -> rerank -> answer.
    question = "How is United States fighting against tobacco addiction?"
    rerank_type = "crossencoder"
    llm_type = "chat"
    articles_csv = save_solr_articles_full(question, keyword_type="rake")
    reranked = crossencoder_rerank_answer(articles_csv, question)
    feed_articles_to_gpt_with_links(reranked, question)
full_chain.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from get_keywords import get_keywords
4
+ from get_articles import save_solr_articles_full
5
+ from rerank import langchain_rerank_answer, langchain_with_sources, crossencoder_rerank_answer, \
6
+ crossencoder_rerank_sentencewise, crossencoder_rerank_sentencewise_articles, no_rerank
7
+ #from feed_to_llm import feed_articles_to_gpt_with_links
8
+ from feed_to_llm_v2 import feed_articles_to_gpt_with_links
9
+
10
+
11
def get_response(question, rerank_type="crossencoder", llm_type="chat"):
    """Run the full retrieval -> rerank -> LLM pipeline for *question*.

    Parameters
    ----------
    question : str
        The user's natural-language question.
    rerank_type, llm_type : str
        Kept for interface compatibility; the cross-encoder reranker and
        chat model are currently always used.

    Returns
    -------
    (response, links, titles, domains); all empty on failure.
    """
    try:
        # Retrieve candidate articles from Solr into a csv file.
        csv_path = save_solr_articles_full(question, keyword_type="rake")

        # Rerank to the top few articles; each entry is a
        # (score, content, uuid, title, domain) tuple.
        reranked_out = crossencoder_rerank_answer(csv_path, question)

        # BUG FIX: feed_articles_to_gpt_with_links takes (information,
        # question). The old code passed a third "citations" argument built
        # with dict-style access on tuples, so every call raised TypeError
        # and fell through to the silent except below.
        return feed_articles_to_gpt_with_links(reranked_out, question)
    except Exception as e:
        # Surface the failure instead of swallowing it silently.
        print(f"get_response failed: {e}")
        return "", [], [], []
33
+
34
+
35
+
36
+
37
+
38
if __name__ == "__main__":
    # Manual smoke test of the full pipeline.
    question = "How is United States fighting against tobacco addiction?"
    rerank_type = "crossencoder"
    llm_type = "chat"
    response, links, titles, domains = get_response(question, rerank_type, llm_type)
    for part in (response, links, titles, domains):
        print(part)
get_articles.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pysolr import Solr
2
+ import os
3
+ import csv
4
+ from sentence_transformers import SentenceTransformer, util
5
+ import torch
6
+
7
+ from get_keywords import get_keywords
8
+ import os
9
+
10
def save_solr_articles_full(query: str, num_articles=15, keyword_type="openai") -> str:
    """Fetch the top articles from Solr for *query* and save them to a csv.

    Input:
        query: str — the user's question.
        num_articles: int — maximum number of articles to keep.
        keyword_type: str — "openai", "rake", or "na" (use the raw query).
    Output: path to the written csv file.
    """
    if keyword_type == "na":
        search_terms = query
    else:
        search_terms = get_keywords(query, keyword_type)
    return save_solr_articles(search_terms, num_articles)
23
+
24
+
25
def remove_spaces_newlines(text: str) -> str:
    """Flatten newlines to spaces and collapse double spaces (single pass).

    Input: text: str
    Output: text: str
    """
    return text.replace('\n', ' ').replace('  ', ' ')
34
+
35
+
36
# truncates long articles to 1500 words
def truncate_article(text: str) -> str:
    """Cap *text* at 1500 whitespace-delimited words."""
    words = text.split()
    if len(words) > 1500:
        text = ' '.join(words[:1500])
    return text
43
+
44
+
45
"""
Searches Solr for articles based on keywords and saves them in a csv file
Input:
    keywords: str
    num_articles: int
Output: path to csv file
Minor details:
    Removes duplicate articles to start with.
    Articles with dead urls are removed since those articles are often weird.
    Articles whose titles share the same five starting words are removed; they are usually duplicates with minor changes.
    If one of title, uuid, cleaned_content, url are missing the article is skipped.
"""
def save_solr_articles(keywords: str, num_articles=15) -> str:
    # Credentials are embedded in the Solr URL; SOLR_KEY comes from the env.
    solr_key = os.getenv("SOLR_KEY")
    SOLR_ARTICLES_URL = f"https://website:{solr_key}@solr.machines.globalhealthwatcher.org:8080/solr/articles/"
    solr = Solr(SOLR_ARTICLES_URL, verify=False)

    # No duplicates
    fq = ['-dups:0']

    query = f'text:({keywords})' + " AND " + "dead_url:(false)"

    # Get top 2*num_articles articles and then remove misformed or duplicate articles
    outputs = solr.search(query, fq=fq, sort="score desc", rows=num_articles * 2)

    article_count = 0

    save_path = os.path.join("data", "articles.csv")
    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))

    with open(save_path, 'w', newline='') as csvfile:
        fieldnames = ['title', 'uuid', 'content', 'url', 'domain']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
        writer.writeheader()

        # First five words of every accepted title, used to detect near-duplicates.
        title_five_words = set()

        for d in outputs.docs:
            if article_count == num_articles:
                break

            # skip if any required field is missing (indexing would raise KeyError)
            if 'title' not in d or 'uuid' not in d or 'cleaned_content' not in d or 'url' not in d:
                continue

            title_cleaned = remove_spaces_newlines(d['title'])

            split = title_cleaned.split()
            # skip if title is a near-duplicate (same first five words as an
            # already accepted article); titles shorter than 5 words are never
            # checked or recorded.
            if not len(split) < 5:
                five_words = title_cleaned.split()[:5]
                five_words = ' '.join(five_words)
                if five_words in title_five_words:
                    continue
                title_five_words.add(five_words)

            article_count += 1

            cleaned_content = remove_spaces_newlines(d['cleaned_content'])
            cleaned_content = truncate_article(cleaned_content)

            # 'domain' is optional in the Solr schema.
            domain = ""
            if 'domain' not in d:
                domain = "Not Specified"
            else:
                domain = d['domain']
            print(domain)

            writer.writerow({'title': title_cleaned, 'uuid': d['uuid'], 'content': cleaned_content, 'url': d['url'],
                             'domain': domain})
    return save_path
117
+
118
+
119
def save_embedding_base_articles(query, article_embeddings, titles, contents, uuids, urls, num_articles=15):
    """Bi-encoder retrieval: rank pre-computed article embeddings against
    *query* and write the top ``num_articles`` rows to data/articles.csv.

    Note: unlike save_solr_articles, the written csv has no 'domain' column;
    downstream rerankers substitute empty domains in that case.
    """
    bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # top_k is fixed at 15; semantic_search returns one hit list per query.
    hits = util.semantic_search(query_embedding, article_embeddings, top_k=15)
    hits = hits[0]
    corpus_ids = [item['corpus_id'] for item in hits]
    r_contents = [contents[idx] for idx in corpus_ids]
    r_titles = [titles[idx] for idx in corpus_ids]
    r_uuids = [uuids[idx] for idx in corpus_ids]
    r_urls = [urls[idx] for idx in corpus_ids]

    save_path = os.path.join("data", "articles.csv")
    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))

    with open(save_path, 'w', newline='', encoding="utf-8") as csvfile:
        fieldNames = ['title', 'uuid', 'content', 'url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldNames, quoting=csv.QUOTE_NONNUMERIC)
        writer.writeheader()
        for i in range(num_articles):
            writer.writerow({'title': r_titles[i], 'uuid': r_uuids[i], 'content': r_contents[i], 'url': r_urls[i]})
    return save_path
get_keywords.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_openai import ChatOpenAI
2
+ from langchain_core.messages import (
3
+ HumanMessage,
4
+ SystemMessage
5
+ )
6
+
7
+ from rake_nltk import Rake
8
+ import nltk
9
+ nltk.download('stopwords')
10
+ nltk.download('punkt')
11
def get_keywords(user_query: str, keyword_type: str) -> str:
    """Extract search keywords from *user_query*.

    Input:
        user_query: str
        keyword_type: str — "openai", "rake", or anything else (e.g. "na")
        to return the query unchanged.
    Output: keywords: str
    """
    if keyword_type == "openai":
        return get_keywords_openai(user_query)
    if keyword_type == "rake":
        return get_keywords_rake(user_query)
    return user_query
26
+
27
+
28
def get_keywords_rake(user_query: str) -> str:
    """Extract key phrases from *user_query* with rake_nltk.

    rake_nltk actually returns ranked keyphrases rather than single keywords;
    they are concatenated (each followed by a space) so the output type
    matches the other keyword backends.

    Input:
        user_query: str
    Output: keywords: str
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(user_query)
    ranked_phrases = extractor.get_ranked_phrases()

    # Note: a trailing space after the final phrase is intentional, preserved
    # from the original implementation.
    return ''.join(phrase + ' ' for phrase in ranked_phrases)
46
+
47
+
48
def get_keywords_openai(user_query: str) -> str:
    """Ask the chat model to extract keywords from *user_query*.

    The model is asked for a comma-separated list; the commas are stripped so
    the result is a space-separated keyword string, matching the other
    keyword backends.

    Input:
        user_query: str
    Output: keywords: str
    """
    llm = ChatOpenAI(temperature=0.0)
    command = "return the keywords of the following query. response should be words separated by commas. "
    message = [
        SystemMessage(content=command),
        HumanMessage(content=user_query)
    ]
    # .invoke() is the supported Runnable entry point; calling the model
    # object directly (llm(message)) is deprecated in current langchain and
    # inconsistent with feed_to_llm_v2.
    response = llm.invoke(message)
    res = response.content.replace(",", "")
    return res
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==4.25.0
2
+ langchain==0.1.14
3
+ langchain-core==0.1.40
4
+ langchain-openai==0.1.1
5
+ nltk==3.8.1
6
+ openai==1.16.2
7
+ pandas==2.2.1
8
+ pysolr==3.9.0
9
+ rake-nltk==1.0.6
10
+ sentence-transformers==2.2.2
11
+ tiktoken==0.5.2
12
+ torch==2.1.2
13
+ huggingface-hub==0.20.2
14
+ python-dotenv==1.0.1
15
+ docarray==0.40.0
rerank.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# reranks the top articles from a given csv file
import time

import pandas as pd
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.indexes import VectorstoreIndexCreator
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_openai import ChatOpenAI
from nltk.tokenize import sent_tokenize
from sentence_transformers import CrossEncoder
9
+
10
"""
This function reranks top articles (15 -> 4) from a given csv, then sends them to the LLM.
Input:
    csv_path: str
    question: str
    top_n: int
Output:
    response: str
    links: list of str
    titles: list of str

Other functions in this file do not send articles to the LLM. This is an exception.
Created using langchain RAG functions. Deprecated.
Update: Use langchain_RAG instead.
"""


def langchain_rerank_answer(csv_path, question, source='url', top_n=4):
    # Build an in-memory vector index over the csv rows; each row's 'url'
    # column becomes the document "source" used for citations.
    llm = ChatOpenAI(temperature=0.0)
    loader = CSVLoader(csv_path, source_column="url")

    index = VectorstoreIndexCreator(
        vectorstore_cls=DocArrayInMemorySearch,
    ).from_loaders([loader])

    # prompt_template = """You are an a chatbot that answers tobacco related questions with source. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
    # {context}
    # Question: {question}"""
    # PROMPT = PromptTemplate(
    #     template=prompt_template, input_variables=["context", "question"]
    # )
    # chain_type_kwargs = {"prompt": PROMPT}

    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=index.vectorstore.as_retriever(),
        verbose=False,
        return_source_documents=True,
        # chain_type_kwargs=chain_type_kwargs,
        # chain_type_kwargs = {
        #     "document_separator": "<<<<>>>>>"
        # },
    )

    # Run the chain and pull the source urls out of the returned documents.
    # NOTE(review): the loop variable below shadows the unused 'source'
    # parameter — confirm the parameter can be dropped.
    answer = qa({"query": question})
    sources = answer['source_documents']
    sources_out = [source.metadata['source'] for source in sources]

    return answer['result'], sources_out
60
+
61
+
62
"""
Langchain with sources.
This function is deprecated. Use langchain_RAG instead.
"""


def langchain_with_sources(csv_path, question, top_n=4):
    # Index the csv rows with each row's 'uuid' as the citation source.
    llm = ChatOpenAI(temperature=0.0)
    loader = CSVLoader(csv_path, source_column="uuid")
    index = VectorstoreIndexCreator(
        vectorstore_cls=DocArrayInMemorySearch,
    ).from_loaders([loader])

    qa = RetrievalQAWithSourcesChain.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=index.vectorstore.as_retriever(),
    )
    # Returns the generated answer plus the uuid sources it cited.
    output = qa({"question": question}, return_only_outputs=True)
    return output['answer'], output['sources']
82
+
83
+
84
# returns list of top n similar articles using crossencoder
def crossencoder_rerank_answer(csv_path: str, question: str, top_n=4) -> list:
    """Rerank whole articles from *csv_path* against *question*.

    Uses cross-encoder/ms-marco-MiniLM-L-6-v2 for scoring.
    Input:
        csv_path: str
        question: str
        top_n: int
    Output:
        list of (score, content, uuid, title, domain) tuples, best first.
        Entries from the first negative score onward are dropped, but at
        least one result is always returned.
    """
    scorer = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    frame = pd.read_csv(csv_path)
    texts = frame['content'].tolist()
    ids = frame['uuid'].tolist()
    headlines = frame['title'].tolist()

    # biencoder retrieval csvs do not carry a domain column
    if 'domain' not in frame:
        sites = [""] * len(texts)
    else:
        sites = frame['domain'].tolist()

    pairs = [[question, text] for text in texts]
    scores = scorer.predict(pairs)
    ranked = sorted(zip(scores, texts, ids, headlines, sites),
                    key=lambda row: row[0], reverse=True)

    top = ranked[:top_n]

    # Drop everything from the first negative score onward; if nothing
    # remains, fall back to the single best article.
    for pos in range(len(top)):
        if top[pos][0] < 0:
            top = top[:pos]
            if len(top) == 0:
                top = ranked[:1]

            break
    return top
127
+
128
+
129
def crossencoder_rerank_sentencewise(csv_path: str, question: str, top_n=10) -> list:
    """Rerank individual sentences (not whole articles) against *question*.

    Returns up to *top_n* (score, sentence, uuid, title, domain) tuples,
    best first; entries from the first negative score onward are dropped,
    keeping at least one result.
    """
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    articles = pd.read_csv(csv_path)
    contents = articles['content'].tolist()
    uuids = articles['uuid'].tolist()
    titles = articles['title'].tolist()

    # biencoder retrieval csvs do not carry a domain column
    if 'domain' not in articles:
        domain = [""] * len(contents)
    else:
        domain = articles['domain'].tolist()

    # Explode each article into sentences, repeating its metadata per sentence.
    sentences = []
    new_uuids = []
    new_titles = []
    new_domains = []
    for idx in range(len(contents)):
        sents = sent_tokenize(contents[idx])
        sentences.extend(sents)
        new_uuids.extend([uuids[idx]] * len(sents))
        new_titles.extend([titles[idx]] * len(sents))
        new_domains.extend([domain[idx]] * len(sents))

    cross_inp = [[question, sent] for sent in sentences]
    cross_scores = cross_encoder.predict(cross_inp)
    scores_sentences = list(zip(cross_scores, sentences, new_uuids, new_titles, new_domains))
    scores_sentences = sorted(scores_sentences, key=lambda x: x[0], reverse=True)

    out_values = scores_sentences[:top_n]

    # if score is less than 0, truncate; keep at least the single best hit
    for idx in range(len(out_values)):
        if out_values[idx][0] < 0:
            out_values = out_values[:idx]
            if len(out_values) == 0:
                out_values = scores_sentences[:1]

            break

    return out_values
169
+
170
+
171
def crossencoder_rerank_sentencewise_sentence_chunks(csv_path, question, top_n=10, chunk_size=2):
    """Rerank overlapping chunks of *chunk_size* consecutive sentences.

    Returns up to *top_n* (score, chunk_text, uuid, title, domain) tuples,
    best first; entries from the first negative score onward are dropped,
    keeping at least one result.
    """
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    articles = pd.read_csv(csv_path)
    contents = articles['content'].tolist()
    uuids = articles['uuid'].tolist()
    titles = articles['title'].tolist()

    # embeddings do not have domain as column
    if 'domain' not in articles:
        domain = [""] * len(contents)
    else:
        domain = articles['domain'].tolist()

    sentences = []
    new_uuids = []
    new_titles = []
    new_domains = []

    for idx in range(len(contents)):
        sents = sent_tokenize(contents[idx])
        sents_merged = []

        # if the number of sentences is less than chunk size, merge and join
        if len(sents) < chunk_size:
            sents_merged.append(' '.join(sents))
        else:
            # sliding window of chunk_size sentences with stride 1
            for i in range(0, len(sents) - chunk_size + 1):
                sents_merged.append(' '.join(sents[i:i + chunk_size]))

        sentences.extend(sents_merged)
        new_uuids.extend([uuids[idx]] * len(sents_merged))
        new_titles.extend([titles[idx]] * len(sents_merged))
        new_domains.extend([domain[idx]] * len(sents_merged))

    cross_inp = [[question, sent] for sent in sentences]
    cross_scores = cross_encoder.predict(cross_inp)
    scores_sentences = list(zip(cross_scores, sentences, new_uuids, new_titles, new_domains))
    scores_sentences = sorted(scores_sentences, key=lambda x: x[0], reverse=True)

    out_values = scores_sentences[:top_n]

    # truncate at the first negative score; keep at least the single best hit
    for idx in range(len(out_values)):
        if out_values[idx][0] < 0:
            out_values = out_values[:idx]
            if len(out_values) == 0:
                out_values = scores_sentences[:1]

            break

    return out_values
221
+
222
+
223
def crossencoder_rerank_sentencewise_articles(csv_path, question, top_n=4):
    """Score sentences against *question* but return whole articles.

    Each sentence is scored individually; an article ranks by its best
    sentence. Returns up to *top_n*
    (score, full_article_content, uuid, title, domain) tuples, best first.
    """
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    contents, uuids, titles, domain = load_articles(csv_path)

    # Explode articles into sentences, keeping the full article text attached
    # to each sentence so whole articles can be returned.
    sentences = []
    contents_elongated = []
    new_uuids = []
    new_titles = []
    new_domains = []

    for idx in range(len(contents)):
        sents = sent_tokenize(contents[idx])
        sentences.extend(sents)
        new_uuids.extend([uuids[idx]] * len(sents))
        contents_elongated.extend([contents[idx]] * len(sents))
        new_titles.extend([titles[idx]] * len(sents))
        new_domains.extend([domain[idx]] * len(sents))

    cross_inp = [[question, sent] for sent in sentences]
    cross_scores = cross_encoder.predict(cross_inp)
    scores_sentences = list(zip(cross_scores, contents_elongated, new_uuids, new_titles, new_domains))
    scores_sentences = sorted(scores_sentences, key=lambda x: x[0], reverse=True)

    # Keep only the best-scoring sentence per article (dedupe by uuid).
    # A seen-set gives O(1) membership instead of rescanning the output list
    # for every item (the previous code was O(n^2) in sentence count).
    seen_uuids = set()
    score_sentences_compressed = []
    for item in scores_sentences:
        if item[2] not in seen_uuids:
            seen_uuids.add(item[2])
            score_sentences_compressed.append(item)

    return score_sentences_compressed[:top_n]
256
+
257
+
258
def no_rerank(csv_path, question, top_n=4):
    """Return the first *top_n* articles without any reranking.

    Emits (score, content, uuid, title, domain) tuples with a neutral score
    of 0.0 so the output shape matches the other rerank helpers — downstream
    consumers (feed_to_llm*) unpack 5-tuples, and the previous 4-tuple
    output broke that contract.
    """
    contents, uuids, titles, domains = load_articles(csv_path)
    scores = [0.0] * len(contents)
    return list(zip(scores, contents, uuids, titles, domains))[:top_n]
261
+
262
+
263
def load_articles(csv_path: str):
    """Read articles.csv and return (contents, uuids, titles, domains) lists.

    Bi-encoder retrieval csvs lack a 'domain' column; empty strings are
    substituted in that case.
    """
    frame = pd.read_csv(csv_path)
    texts = frame['content'].tolist()
    ids = frame['uuid'].tolist()
    names = frame['title'].tolist()
    if 'domain' in frame:
        sites = frame['domain'].tolist()
    else:
        sites = [""] * len(texts)
    return texts, ids, names, sites
273
+