Spaces: Build error

Commit 016ab20: "update files"
Committed by: zhenyundeng
Parent: 200e5b6

Files changed:
- .gitattributes (+3 / -1)
- app.py (+179 / -11)
- utils.py (+3 / -1)
.gitattributes
CHANGED
@@ -25,7 +25,6 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +32,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
+*.db filter=lfs diff=lfs merge=lfs -text
+
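With *.json and *.db now routed through Git LFS, JSON data files such as the averitec/data/train.json that app.py loads at startup are stored in the repo as LFS pointer files. A minimal sketch of guarding that load against an unfetched pointer; the is_lfs_pointer helper is illustrative and not part of this commit:

import json

def is_lfs_pointer(path):
    # Git LFS pointer files begin with this version line instead of real content.
    with open(path, "rb") as f:
        return f.readline().startswith(b"version https://git-lfs.github.com/spec/v1")

data_path = "averitec/data/train.json"  # the file app.py loads at startup in this commit
if is_lfs_pointer(data_path):
    raise RuntimeError(data_path + " is an unfetched Git LFS pointer; run `git lfs pull` first.")
train_examples = json.load(open(data_path, "r"))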
app.py
CHANGED
@@ -69,7 +69,9 @@ nlp = spacy.load("en_core_web_sm")
 # ---------------------------------------------------------------------------
 # Load sample dict for AVeriTeC search
 # all_samples_dict = json.load(open('averitec/data/all_samples.json', 'r'))
+train_examples = json.load(open('averitec/data/train.json', 'r'))
 
+print(train_examples[0]['claim'])
 # ---------------------------------------------------------------------------
 # ---------- Load pretrained models ----------
 # ---------- load Evidence retrieval model ----------
@@ -424,9 +426,8 @@ def QAprediction(claim, evidence, sources):
 
 # ----------GoogleAPIretriever---------
 def generate_reference_corpus(reference_file):
-    with open(reference_file) as f:
-
-        train_examples = json.load(f)
+    # with open(reference_file) as f:
+    #     train_examples = json.load(f)
 
     all_data_corpus = []
     tokenized_corpus = []
@@ -578,6 +579,12 @@ def get_and_store(url_link, fp, worker, worker_stack):
     gc.collect()
 
 
+def get_text_from_link(url_link):
+    page_lines = url2lines(url_link)
+
+    return "\n".join([url_link] + page_lines)
+
+
 def get_google_search_results(api_key, search_engine_id, google_search, sort_date, search_string, page=0):
     search_results = []
     for i in range(3):
@@ -599,7 +606,7 @@ def get_google_search_results(api_key, search_engine_id, google_search, sort_date, search_string, page=0):
     return search_results
 
 
-def averitec_search(claim, generate_question, speaker="they", check_date="2024-07-01", n_pages=1):  # n_pages=3
+def averitec_search_michael(claim, generate_question, speaker="they", check_date="2024-07-01", n_pages=1):  # n_pages=3
     # default config
     api_key = os.environ["GOOGLE_API_KEY"]
     search_engine_id = os.environ["GOOGLE_SEARCH_ENGINE_ID"]
@@ -651,7 +658,6 @@ def averitec_search(claim, generate_question, speaker="they", check_date="2024-07-01", n_pages=1):
         for page_num in range(n_pages):
             search_results = get_google_search_results(api_key, search_engine_id, google_search, sort_date,
                                                        this_search_string, page=page_num)
-            search_results = search_results[:5]
 
             for result in search_results:
                 link = str(result["link"])
@@ -668,8 +674,6 @@ def averitec_search(claim, generate_question, speaker="they", check_date="2024-07-01", n_pages=1):
                 if link.endswith(".pdf") or link.endswith(".doc"):
                     continue
 
-                store_file_path = ""
-
                 if link in visited:
                     store_file_path = visited[link]
                 else:
@@ -678,7 +682,7 @@ def averitec_search(claim, generate_question, speaker="they", check_date="2024-07-01", n_pages=1):
                         store_counter) + ".store"
                     visited[link] = store_file_path
 
-                    while len(worker_stack) == 0:  # Wait for a
+                    while len(worker_stack) == 0:  # Wait for a worker to become available. Check every second.
                         sleep(1)
 
                     worker = worker_stack.pop()
@@ -692,6 +696,89 @@ def averitec_search(claim, generate_question, speaker="they", check_date="2024-07-01", n_pages=1):
     return retrieve_evidence
 
 
+def averitec_search(claim, generate_question, speaker="they", check_date="2024-07-01", n_pages=1):  # n_pages=3
+    # default config
+    api_key = os.environ["GOOGLE_API_KEY"]
+    search_engine_id = os.environ["GOOGLE_SEARCH_ENGINE_ID"]
+
+    blacklist = [
+        "jstor.org",  # Blacklisted because their pdfs are not labelled as such, and clog up the download
+        "facebook.com",  # Blacklisted because only post titles can be scraped, but the scraper doesn't know this,
+        "ftp.cs.princeton.edu",  # Blacklisted because it hosts many large NLP corpora that keep showing up
+        "nlp.cs.princeton.edu",
+        "huggingface.co"
+    ]
+
+    blacklist_files = [  # Blacklisted some NLP nonsense that crashes my machine with OOM errors
+        "/glove.",
+        "ftp://ftp.cs.princeton.edu/pub/cs226/autocomplete/words-333333.txt",
+        "https://web.mit.edu/adamrose/Public/googlelist",
+    ]
+
+    # save to folder
+    store_folder = "averitec/data/store/retrieved_docs"
+    #
+    index = 0
+    questions = [q["question"] for q in generate_question]
+
+    # check the date of the claim
+    current_date = datetime.now().strftime("%Y-%m-%d")
+    sort_date = check_claim_date(current_date)  # check_date="2022-01-01"
+
+    #
+    search_strings = []
+    search_types = []
+
+    search_string_2 = string_to_search_query(claim, None)
+    search_strings += [search_string_2, claim, ]
+    search_types += ["claim", "claim-noformat", ]
+
+    search_strings += questions
+    search_types += ["question" for _ in questions]
+
+    # start to search
+    search_results = []
+    visited = {}
+    store_counter = 0
+    worker_stack = list(range(10))
+
+    retrieve_evidence = []
+
+    for this_search_string, this_search_type in zip(search_strings, search_types):
+        for page_num in range(n_pages):
+            search_results = get_google_search_results(api_key, search_engine_id, google_search, sort_date,
+                                                       this_search_string, page=page_num)
+            search_results = search_results[:5]
+
+            for result in search_results:
+                link = str(result["link"])
+                domain = get_domain_name(link)
+
+                if domain in blacklist:
+                    continue
+                broken = False
+                for b_file in blacklist_files:
+                    if b_file in link:
+                        broken = True
+                if broken:
+                    continue
+                if link.endswith(".pdf") or link.endswith(".doc"):
+                    continue
+
+                store_file_path = ""
+
+                if link in visited:
+                    web_text = visited[link]
+                else:
+                    web_text = get_text_from_link(link)
+                    visited[link] = web_text
+
+                line = [str(index), claim, link, str(page_num), this_search_string, this_search_type, web_text]
+                retrieve_evidence.append(line)
+
+    return retrieve_evidence
+
+
 def claim2prompts(example):
     claim = example["claim"]
 
@@ -725,8 +812,8 @@ def claim2prompts(example):
 
 
 def generate_step2_reference_corpus(reference_file):
-    with open(reference_file) as f:
-        train_examples = json.load(f)
+    # with open(reference_file) as f:
+    #     train_examples = json.load(f)
 
     prompt_corpus = []
     tokenized_corpus = []
@@ -762,6 +849,87 @@ def decorate_with_questions(claim, retrieve_evidence, top_k=10):  # top_k=100
     tokenized_corpus = []
     all_data_corpus = []
 
+    for retri_evi in tqdm.tqdm(retrieve_evidence):
+        # store_file = retri_evi[-1]
+        # with open(store_file, 'r') as f:
+        web_text = retri_evi[-1]
+        lines_in_web = web_text.split("\n")
+
+        first = True
+        for line in lines_in_web:
+            # for line in f:
+            line = line.strip()
+
+            if first:
+                first = False
+                location_url = line
+                continue
+
+            if len(line) > 3:
+                entry = nltk.word_tokenize(line)
+                if (location_url, line) not in all_data_corpus:
+                    tokenized_corpus.append(entry)
+                    all_data_corpus.append((location_url, line))
+
+    if len(tokenized_corpus) == 0:
+        print("")
+
+    bm25 = BM25Okapi(tokenized_corpus)
+    s = bm25.get_scores(nltk.word_tokenize(claim))
+    top_n = np.argsort(s)[::-1][:top_k]
+    docs = [all_data_corpus[i] for i in top_n]
+
+    generate_qa_pairs = []
+    # Then, generate questions for those top 50:
+    for doc in tqdm.tqdm(docs):
+        # prompt_lookup_str = example["claim"] + " " + doc[1]
+        prompt_lookup_str = doc[1]
+
+        prompt_s = prompt_bm25.get_scores(nltk.word_tokenize(prompt_lookup_str))
+        prompt_n = 10
+        prompt_top_n = np.argsort(prompt_s)[::-1][:prompt_n]
+        prompt_docs = [prompt_corpus[i] for i in prompt_top_n]
+
+        claim_prompt = "Evidence: " + doc[1].replace("\n", " ") + "\nQuestion answered: "
+        prompt = "\n\n".join(prompt_docs + [claim_prompt])
+        sentences = [prompt]
+
+        inputs = qg_tokenizer(sentences, padding=True, return_tensors="pt").to(device)
+        outputs = qg_model.generate(inputs["input_ids"], max_length=5000, num_beams=2, no_repeat_ngram_size=2,
+                                    early_stopping=True)
+
+        tgt_text = qg_tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)[0]
+        # We are not allowed to generate more than 250 characters:
+        tgt_text = tgt_text[:250]
+
+        qa_pair = [tgt_text.strip().split("?")[0].replace("\n", " ") + "?", doc[1].replace("\n", " "), doc[0]]
+        generate_qa_pairs.append(qa_pair)
+
+    return generate_qa_pairs
+
+
+def decorate_with_questions_michale(claim, retrieve_evidence, top_k=10):  # top_k=100
+    #
+    reference_file = "averitec/data/train.json"
+    tokenized_corpus, prompt_corpus = generate_step2_reference_corpus(reference_file)
+    prompt_bm25 = BM25Okapi(tokenized_corpus)
+
+    # Define the bloom model:
+    accelerator = Accelerator()
+    accel_device = accelerator.device
+    # device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    # tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-7b1")
+    # model = BloomForCausalLM.from_pretrained(
+    #     "bigscience/bloom-7b1",
+    #     device_map="auto",
+    #     torch_dtype=torch.bfloat16,
+    #     offload_folder="./offload"
+    # )
+
+    #
+    tokenized_corpus = []
+    all_data_corpus = []
+
 for retri_evi in tqdm.tqdm(retrieve_evidence):
     store_file = retri_evi[-1]
 
@@ -1222,7 +1390,7 @@ def chat(claim, history, sources):
     try:
        # Log answer on Azure Blob Storage
        # IF AZURE_ISSAVE=TRUE, save the logs into the Azure share client.
-        if
+        if os.environ["AZURE_ISSAVE"] == "TRUE":
            timestamp = str(datetime.now().timestamp())
            # timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            file = timestamp + ".json"
utils.py
CHANGED
@@ -2,11 +2,13 @@ import numpy as np
 import random
 import string
 import uuid
+from datetime import datetime
 
 
 def create_user_id():
     """Create user_id
     str: String to id user
     """
+    current_date = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
     user_id = str(uuid.uuid4())
-    return user_id
+    return current_date + '_' +user_id