Upload folder using huggingface_hub
- .gitignore +4 -0
- LICENCE +21 -0
- README.md +25 -6
- app.py +30 -0
- example.txt +1 -0
- fetch_evidence.py +184 -0
- flagged/log.csv +5 -0
- generate_search_queries.py +137 -0
- llm_api_utils.py +60 -0
- objective_claims.txt +3 -0
- pipeline_paths.py +25 -0
- requirements.in +18 -0
- requirements.txt +508 -0
- run_pipeline.py +134 -0
- step1_api_claim_extractor.py +164 -0
- step2_api_fix_passage_anchors.py +165 -0
- step3_api_identify_objective_claims.py +185 -0
- step41_api_fetch_cohere_wikipedia_evidence.py +97 -0
- step42_api_fetch_google_search_evidence.py +145 -0
- step5_api_embed_search_results.py +290 -0
- step6_api_claims_to_evidence.py +85 -0
- step7_api_check_claims_against_evidence.py +153 -0
- step8_api_format_fact_checked_document.py +121 -0
- subjective_claims.txt +1 -0
- text_utils.py +188 -0
- utils.py +18 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
+.env
+.venv
+data
+__pycache__/
LICENCE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 vladbogo, albanie, ioanacroi, abuonomo
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
CHANGED
@@ -1,12 +1,31 @@
 ---
 title: Filtir
-
-colorFrom: pink
-colorTo: yellow
+app_file: app.py
 sdk: gradio
 sdk_version: 4.19.0
-app_file: app.py
-pinned: false
 ---
+# Filtir - fact checking pipeline
+
+This repo contains the Filtir pipeline for claim extraction and fact-checking.
+
+## Prerequisites
+
+### Create and prepare venv
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+```
+
+### Setup keys
+In order to run the code you need to set up the following keys and add them to .env:
+
+- OPENAI_API_KEY - used to call the OpenAI API
+- COHERE_API_KEY and WEAVIATE_API_KEY - used for Wikipedia search
+- GOOGLE_CLOUD_API_KEY and GOOGLE_CUSTOM_SEARCH_ENGINE_ID - used for Google search
+
+## Run the pipeline
 
-
+```bash
+python run_pipeline.py --file example.txt --model gpt-4-1106-preview
+```
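Note: the keys listed in the README are read from `.env` via `python-dotenv` (see `run_pipeline.py`, which calls `load_dotenv()`). A minimal sketch of a pre-flight check that the expected variables are set; the script name and the check itself are illustrative and not part of this commit:

```python
# check_env.py -- illustrative pre-flight check, not part of this commit
import os

from dotenv import load_dotenv

# Key names taken from the README above.
REQUIRED_KEYS = [
    "OPENAI_API_KEY",
    "COHERE_API_KEY",
    "WEAVIATE_API_KEY",
    "GOOGLE_CLOUD_API_KEY",
    "GOOGLE_CUSTOM_SEARCH_ENGINE_ID",
]

load_dotenv()  # read key=value pairs from .env into the process environment
missing = [key for key in REQUIRED_KEYS if not os.environ.get(key)]
if missing:
    raise SystemExit(f"Missing keys in .env: {', '.join(missing)}")
print("All required API keys are set.")
```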
app.py
ADDED
@@ -0,0 +1,30 @@
+# gradio_app.py
+
+import gradio as gr
+from run_pipeline import get_fact_checked
+
+
+def fact_check_function(text, model):
+    # Assume the text is already read from the user input, so we don't need to open a file here
+    out = get_fact_checked(text, mode="slow", model=model)
+    return out["fact_checked_md"]
+
+
+def create_gradio_interface():
+    iface = gr.Interface(
+        allow_flagging=False,
+        fn=fact_check_function,
+        inputs=[
+            gr.Textbox(
+                lines=10, placeholder="Enter text to fact-check...", label="Input Text"
+            ),
+            gr.Dropdown(choices=["gpt-4-1106-preview"], label="Model"),
+        ],
+        outputs=gr.Markdown(label="Filtir Output"),
+    )
+    return iface
+
+
+if __name__ == "__main__":
+    iface = create_gradio_interface()
+    iface.launch()
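For quick checks without the web UI, the Gradio wrapper can be bypassed and the handler called directly; a sketch assuming the dependencies and `.env` keys are in place:

```python
# illustrative: call the fact-check handler from app.py without launching Gradio
from app import fact_check_function

report_md = fact_check_function("Arendt fled Germany in 1933.", "gpt-4-1106-preview")
print(report_md)
```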
example.txt
ADDED
@@ -0,0 +1 @@
+Arendt fled Germany in 1933.
fetch_evidence.py
ADDED
@@ -0,0 +1,184 @@
+import requests
+from bs4 import BeautifulSoup
+import json
+import json5
+import argparse
+from pathlib import Path
+import multiprocessing as mp
+from zsvision.zs_multiproc import starmap_with_kwargs
+from pipeline_paths import PIPELINE_PATHS
+from datetime import datetime
+import urllib.robotparser
+import urllib.parse
+from utils import get_google_search_results
+
+import time
+from random import randint
+from fake_useragent import UserAgent
+from newspaper import Article, Config
+
+
+def can_scrape(url, user_agent="*"):
+    rp = urllib.robotparser.RobotFileParser()
+    rp.set_url(f"{url.scheme}://{url.netloc}/robots.txt")
+    # be conservative - if we can't find robots.txt, don't scrape
+    try:
+        rp.read()
+        ok_to_scrape = rp.can_fetch(user_agent, url.geturl())
+    except urllib.error.URLError:
+        ok_to_scrape = False
+    return ok_to_scrape
+
+
+def fetch_search_results_to_gather_evidence(
+    args,
+    idx: int,
+    total: int,
+    search_results_dest_path: Path,
+    queryset: dict,
+):
+    user_agent = UserAgent()
+    config = Config()
+    config.fetch_images = False
+    print(f"Query {idx}/{total}")
+
+    search_results_dest_path.parent.mkdir(exist_ok=True, parents=True)
+
+    # check if we already have search_results for this title
+    if search_results_dest_path.exists() and not args.refresh:
+        print(f"Found existing search results at {search_results_dest_path}, skipping")
+        return 0
+
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
+        "DNT": "1",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+    }
+
+    # we assume some sites won't permit scraping, so we'll skip these
+    num_results = args.num_search_results_to_keep + 5
+    results = {}
+
+    for item in queryset:
+        if item["search_query"] == "no suitable query":
+            item["search_results"] = []
+            continue
+
+        search_results = get_google_search_results(
+            query_str=item["search_query"], num_results=num_results
+        )
+
+        if search_results == [{"Result": "No good Google Search Result was found"}]:
+            item["search_results"] = []
+            continue
+
+        parsed_results = []
+        for search_result in search_results:
+            if not can_scrape(
+                urllib.parse.urlparse(search_result["link"]), user_agent="MyScraper"
+            ):
+                print(
+                    f"Skipping {search_result['link']} because it doesn't permit scraping"
+                )
+                continue
+            try:
+                config.browser_user_agent = user_agent.random
+                article = Article(search_result["link"], language="en", config=config)
+                article.download()
+                article.parse()
+                text = article.text
+            except Exception as e:
+                print(f"Error parsing article: {e}, trying with requests.get...")
+                try:
+                    response = requests.get(
+                        search_result["link"], timeout=15, headers=headers
+                    )
+                    html = response.text
+                    soup = BeautifulSoup(html, features="html.parser")
+                    text = soup.get_text()
+                except Exception as exception:
+                    print(f"Error parsing article: {exception}")
+                    raise exception
+
+            search_result["text"] = text
+            parsed_results.append(search_result)
+            if len(parsed_results) == args.num_search_results_to_keep:
+                break
+        item["search_results"] = parsed_results
+
+    # update the queryset with new information
+    date_str = datetime.now().strftime("%Y-%m-%d")
+    results = {"documents": queryset, "dates": {"search_results_fetched": date_str}}
+
+    print(
+        f"Writing web pages for search results for {len(queryset)} queries to {search_results_dest_path}"
+    )
+    with open(search_results_dest_path, "w") as f:
+        f.write(json.dumps(results, indent=4, sort_keys=True))
+
+
+def main():
+    args = parse_args()
+    search_query_paths = list(
+        PIPELINE_PATHS["search_queries_for_evidence"].glob("**/*.json")
+    )
+
+    if args.limit:
+        print(f"Limited to {args.limit} search querysets")
+        search_query_paths = search_query_paths[: args.limit]
+
+    kwarg_list = []
+    for idx, search_query_path in enumerate(search_query_paths):
+        rel_path = search_query_path.relative_to(
+            PIPELINE_PATHS["search_queries_for_evidence"]
+        )
+        dest_path = PIPELINE_PATHS["google_search_results_evidence"] / rel_path
+
+        if dest_path.exists() and not args.refresh:
+            print(f"For {search_query_path}, found results at {dest_path}, skipping")
+            continue
+
+        with open(search_query_path, "r") as f:
+            queryset = json.load(f)
+        kwarg_list.append(
+            {
+                "idx": idx,
+                "total": len(search_query_paths),
+                "search_results_dest_path": dest_path,
+                "args": args,
+                "queryset": queryset,
+            }
+        )
+
+    # provide the total number of queries to each process
+    for kwargs in kwarg_list:
+        kwargs["total"] = len(kwarg_list)
+
+    # single process
+    if args.processes == 1:
+        cost = 0
+        for kwargs in kwarg_list:
+            fetch_search_results_to_gather_evidence(**kwargs)
+    else:  # multiprocess
+        func = fetch_search_results_to_gather_evidence
+        with mp.Pool(processes=args.processes) as pool:
+            starmap_with_kwargs(pool=pool, func=func, kwargs_iter=kwarg_list)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model", default="gpt-3.5-turbo", choices=["gpt-4", "gpt-3.5-turbo"]
+    )
+    parser.add_argument("--limit", default=0, type=int)
+    parser.add_argument("--refresh", action="store_true")
+    parser.add_argument("--num_search_results_to_keep", type=int, default=3)
+    parser.add_argument("--processes", type=int, default=1)
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    main()
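`can_scrape` expects a URL that has already been parsed with `urllib.parse.urlparse`; a minimal usage sketch (the example URL is taken from `flagged/log.csv` below):

```python
# illustrative use of the robots.txt check from fetch_evidence.py
import urllib.parse

from fetch_evidence import can_scrape

url = urllib.parse.urlparse("https://physicsworld.com/a/fighting-flat-earth-theory/")
if can_scrape(url, user_agent="MyScraper"):
    print(f"OK to fetch {url.geturl()}")
else:
    print("robots.txt disallows fetching (or could not be read)")
```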
flagged/log.csv
ADDED
@@ -0,0 +1,5 @@
+Input Text,Model,Fact Checked Output,flag,username,timestamp
+,gpt-4-1106-preview,,,,2024-02-16 13:59:41.379589
+The earth is flat,gpt-4-1106-preview,"The earth is flat[^The-earth-is-flat]
+
+[^The-earth-is-flat]: ❗ Claim: The earth is flat. 👉 Unsupported ""Flat-Earth ideas are based on basic scientific misunderstandings that can be easily refuted."",""For most people, even those who have no physics background, the evidence for a spherical Earth is obvious."" The claim that the Earth is flat is directly contradicted by the evidence, which states that Flat-Earth ideas are based on basic scientific misunderstandings and that the evidence for a spherical Earth is obvious., URLs: https://physicsworld.com/a/fighting-flat-earth-theory/, date accessed: 2024-02-16 ",,,2024-02-16 14:00:40.524144
generate_search_queries.py
ADDED
@@ -0,0 +1,137 @@
+import argparse
+from pathlib import Path
+import numpy as np
+from pipeline_paths import PIPELINE_PATHS
+import json
+from zsvision.zs_utils import BlockTimer
+from typing import Dict, List
+from llm_api_utils import (
+    call_openai_with_exponetial_backoff,
+    estimate_cost_of_text_generation_api_call,
+    init_openai_with_api_key,
+)
+
+
+def generate_search_queries(args, src_path: Path, dest_path: Path):
+    """
+    Generate a search query that can be used to verify a claim.
+    """
+    init_openai_with_api_key(api_key_path=args.api_key_path)
+    with open(src_path, "r") as f:
+        claims_and_sources = json.load(f)
+
+    # exclude subjective claims
+    original_num_claims = len(claims_and_sources)
+    claims_and_sources = [
+        claim_and_source
+        for claim_and_source in claims_and_sources
+        if claim_and_source["label"] == "objective"
+    ]
+    num_claims = len(claims_and_sources)
+    print(
+        f"Filtered from {original_num_claims} claims to {num_claims} objective claims"
+    )
+
+    # we limit the number of claims per api call (otherwise GPT-4 can choke)
+    num_batches = int(np.ceil(num_claims / args.max_claims_per_api_call))
+    claims_and_sources_batches = [
+        batch.tolist() for batch in np.array_split(claims_and_sources, num_batches)
+    ]
+    queries = []
+
+    all_claims_str = "\n".join([claim["claim"] for claim in claims_and_sources])
+
+    for idx, claims_and_sources_batch in enumerate(claims_and_sources_batches):
+        print(
+            f"Processing batch {idx+1} of {len(claims_and_sources_batches)} (containing {len(claims_and_sources_batch)} claims)"
+        )
+
+        claim_str = "\n".join([claim["claim"] for claim in claims_and_sources_batch])
+        num_batch_claims = len(claims_and_sources_batch)
+
+        # we provide the full list of claims as context (to help resolve ambiguity), but only ask for queries for the current batch
+        prompt = f"""\
+You are working as part of a team and your individual task is to help check a subset of the following claims:\n
+{all_claims_str}
+
+Your individual task is as follows. \
+For each of the {num_batch_claims} claims made below, provide a suitable Google search query that would enable a human to verify the claim. \
+Note that Google can perform calculations and conversions, so you can use it to check numerical claims. \
+If you think no Google query will be useful, then write "no suitable query". \
+Each proposed Google search query should be on a separate line (do not prefix your queries with bullet points or numbers). \
+There should be {num_batch_claims} queries in total.\n \
+
+{claim_str}
+"""
+        persona = "You are a careful research assistant who helps with fact-checking and editing informative articles."
+        system_message = {"role": "system", "content": persona}
+        user_message = {"role": "user", "content": prompt}
+        messages = [system_message, user_message]
+
+        with BlockTimer(f"Using OpenAI API to extract claims with {args.model}"):
+            response = call_openai_with_exponetial_backoff(
+                model=args.model,
+                temperature=args.temperature,
+                messages=messages,
+            )
+
+        cost = estimate_cost_of_text_generation_api_call(
+            model=args.model, response=response, verbose=True
+        )
+
+        proposed_queries = response.choices[0].message.content
+        batch_queries = proposed_queries.split("\n")
+        assert (
+            len(batch_queries) == num_batch_claims
+        ), f"Expected {num_batch_claims} queries, but got {len(batch_queries)}"
+        print(f"Generated {len(batch_queries)} queries (cost: {cost:.4f} USD)")
+        queries.extend(batch_queries)
+
+    querysets = []
+    for claim_and_source, query in zip(claims_and_sources, queries):
+        queryset = {**claim_and_source, "search_query": query}
+        querysets.append(queryset)
+
+    dest_path.parent.mkdir(exist_ok=True, parents=True)
+    with open(dest_path, "w") as f:
+        json.dump(querysets, f, indent=4, sort_keys=True)
+
+
+def main():
+    args = parse_args()
+
+    src_paths = list(
+        PIPELINE_PATHS["extracted_claims_with_classifications_dir"].glob("**/*.json")
+    )
+    print(
+        f"Found {len(src_paths)} claim files in {PIPELINE_PATHS['extracted_claims_with_classifications_dir']}"
+    )
+    dest_dir = PIPELINE_PATHS["search_queries_for_evidence"]
+
+    for src_path in src_paths:
+        dest_path = dest_dir / src_path.relative_to(
+            PIPELINE_PATHS["extracted_claims_with_classifications_dir"]
+        )
+        if not dest_path.exists() or args.refresh:
+            generate_search_queries(
+                args=args,
+                src_path=src_path,
+                dest_path=dest_path,
+            )
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--temperature", type=float, default=0)
+    parser.add_argument(
+        "--model", default="gpt-3.5-turbo", choices=["gpt-4", "gpt-3.5-turbo"]
+    )
+    parser.add_argument("--dest_dir", default="data/search_queries", type=Path)
+    parser.add_argument("--api_key_path", default="OPENAI_API_KEY.txt")
+    parser.add_argument("--max_claims_per_api_call", type=int, default=10)
+    parser.add_argument("--refresh", action="store_true")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    main()
llm_api_utils.py
ADDED
@@ -0,0 +1,60 @@
+import random
+import cohere
+import os
+import openai
+import time
+import backoff
+
+
+PRICE_PER_1K_TOKENS = {
+    "gpt-4": {"prompt": 0.03, "completion": 0.06},
+    "gpt-4-1106-preview": {"prompt": 0.01, "completion": 0.03},
+    "gpt-3.5-turbo": {"prompt": 0.001, "completion": 0.002},
+    "ada": {"embed": 0.0004},
+    "text-embedding-ada-002": {"embed": 0.0001},
+}
+
+
+EMBEDDING_DIMENSIONS = {
+    "ada": 1536,
+    "text-embedding-ada-002": 1536,
+}
+
+
+def estimate_cost_of_text_generation_api_call(
+    model: str, response: dict, verbose: bool
+) -> float:
+    completion_tokens = response.usage.completion_tokens
+    prompt_tokens = response.usage.prompt_tokens
+    total_tokens = response.usage.total_tokens
+
+    prompt_cost = prompt_tokens / 1000 * PRICE_PER_1K_TOKENS[model]["prompt"]
+    completion_cost = (
+        completion_tokens / 1000 * PRICE_PER_1K_TOKENS[model]["completion"]
+    )
+    cost = prompt_cost + completion_cost
+
+    if verbose:
+        summary = f"""\
+Used {prompt_tokens} prompt tokens, {completion_tokens} completion tokens, {total_tokens} total tokens
+Estimated cost: {cost:.4f} USD
+"""
+        print(summary)
+    return cost
+
+
+@backoff.on_exception(backoff.expo, (openai.RateLimitError, openai.APIConnectionError))
+def call_openai_with_exponetial_backoff(**kwargs):
+    rand_sleep_in_secs = 5 * random.random()
+    time.sleep(rand_sleep_in_secs)
+    return openai.chat.completions.create(**kwargs)
+
+
+def init_openai_with_api_key():
+    openai.api_key = os.environ.get("OPENAI_API_KEY")
+
+
+def init_cohere_with_api_key():
+    COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
+    co = cohere.Client(COHERE_API_KEY)
+    return co
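A minimal sketch of the call pattern the pipeline steps use with these helpers (compare `generate_search_queries.py` and `step1_api_claim_extractor.py`): build the messages, call the backoff-wrapped chat completion, then estimate the cost. The example prompt text is illustrative:

```python
# illustrative: how the pipeline steps combine the helpers in llm_api_utils.py
from llm_api_utils import (
    call_openai_with_exponetial_backoff,
    estimate_cost_of_text_generation_api_call,
    init_openai_with_api_key,
)

init_openai_with_api_key()  # reads OPENAI_API_KEY from the environment
messages = [
    {"role": "system", "content": "You are a careful research assistant."},
    {"role": "user", "content": "Suggest a Google query to verify: Arendt fled Germany in 1933."},
]
response = call_openai_with_exponetial_backoff(
    model="gpt-3.5-turbo", temperature=0, messages=messages
)
cost = estimate_cost_of_text_generation_api_call(
    model="gpt-3.5-turbo", response=response, verbose=True
)
print(response.choices[0].message.content, f"(~{cost:.4f} USD)")
```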
objective_claims.txt
ADDED
@@ -0,0 +1,3 @@
+Hannah Arendt was born in 1906.
+Hannah Arendt was raised in a Jewish family.
+Arendt fled Germany in 1933.
pipeline_paths.py
ADDED
@@ -0,0 +1,25 @@
+"""A module that contains the paths to the various files and folders used in the pipeline."""
+from pathlib import Path
+
+PIPELINE_PATHS = {
+    # google queries
+    "google_custom_search_engine_id_path": "google_custom_search_engine_id.txt",
+    # raw inputs to pipeline
+    "source_document_dir": "data/source_documents",
+    # claim extraction
+    "extracted_claims_dir": "data/extracted_claims",
+    "extracted_claims_with_anchor_fixes_dir": "data/extracted_claims_with_anchor_fixes",
+    "extracted_claims_with_classifications_dir": "data/extracted_with_classifications_claims",
+    "objective_claims_dir": "data/objective_claims",
+    # evidence gathering
+    "cohere_wikipedia_evidence": "data/evidence_gathering/cohere_wikipedia",
+    "google_search_results_evidence": "data/evidence_gathering/google_search_results",
+    "faiss_db_embeddings_for_evidence": "data/faiss_db_embeddings_for_evidence",
+    "web_evidence_chunks": "data/evidence_gathering/web_evidence_chunks",
+    # claim evaluation
+    "evaluated_claims_dir": "data/claim_evaluation/claim_verdicts",
+    # reformatted document
+    "fact_checked_document_dir": "data/fact_checked_documents",
+}
+
+PIPELINE_PATHS = {key: Path(value) for key, value in PIPELINE_PATHS.items()}
requirements.in
ADDED
@@ -0,0 +1,18 @@
+zsvision
+openai
+cohere
+ipdb
+weaviate-client
+bs4
+json5
+fake-useragent
+newspaper3k
+google-api-python-client
+faiss-cpu
+tiktoken
+langchain
+python-dotenv
+langchain-community
+langchain-openai
+pip-tools
+gradio
requirements.txt
ADDED
@@ -0,0 +1,508 @@
+#
+# This file is autogenerated by pip-compile with Python 3.11
+# by the following command:
+#
+#    pip-compile requirements.in
+#
+aiofiles==23.2.1
+    # via gradio
+aiohttp==3.9.3
+    # via
+    #   cohere
+    #   langchain
+    #   langchain-community
+aiosignal==1.3.1
+    # via aiohttp
+altair==5.2.0
+    # via gradio
+annotated-types==0.6.0
+    # via pydantic
+anyio==4.2.0
+    # via
+    #   httpx
+    #   langchain-core
+    #   openai
+    #   starlette
+asttokens==2.4.1
+    # via stack-data
+attrs==23.2.0
+    # via
+    #   aiohttp
+    #   jsonschema
+    #   referencing
+authlib==1.3.0
+    # via weaviate-client
+backoff==2.2.1
+    # via cohere
+beartype==0.17.2
+    # via zsvision
+beautifulsoup4==4.12.3
+    # via
+    #   bs4
+    #   feedfinder2
+    #   newspaper3k
+bs4==0.0.2
+    # via -r requirements.in
+build==1.0.3
+    # via pip-tools
+cachetools==5.3.2
+    # via google-auth
+certifi==2024.2.2
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+cffi==1.16.0
+    # via cryptography
+charset-normalizer==3.3.2
+    # via requests
+click==8.1.7
+    # via
+    #   nltk
+    #   pip-tools
+    #   typer
+    #   uvicorn
+cohere==4.47
+    # via -r requirements.in
+colorama==0.4.6
+    # via typer
+contourpy==1.2.0
+    # via matplotlib
+cryptography==42.0.2
+    # via authlib
+cssselect==1.2.0
+    # via newspaper3k
+cycler==0.12.1
+    # via matplotlib
+dataclasses-json==0.6.4
+    # via
+    #   langchain
+    #   langchain-community
+decorator==5.1.1
+    # via
+    #   ipdb
+    #   ipython
+distro==1.9.0
+    # via openai
+executing==2.0.1
+    # via stack-data
+faiss-cpu==1.7.4
+    # via -r requirements.in
+fake-useragent==1.4.0
+    # via -r requirements.in
+fastapi==0.109.2
+    # via gradio
+fastavro==1.9.4
+    # via cohere
+feedfinder2==0.0.4
+    # via newspaper3k
+feedparser==6.0.11
+    # via newspaper3k
+ffmpy==0.3.2
+    # via gradio
+filelock==3.13.1
+    # via
+    #   huggingface-hub
+    #   tldextract
+fonttools==4.48.1
+    # via matplotlib
+frozenlist==1.4.1
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec==2024.2.0
+    # via
+    #   gradio-client
+    #   huggingface-hub
+google-api-core==2.17.0
+    # via google-api-python-client
+google-api-python-client==2.118.0
+    # via -r requirements.in
+google-auth==2.27.0
+    # via
+    #   google-api-core
+    #   google-api-python-client
+    #   google-auth-httplib2
+google-auth-httplib2==0.2.0
+    # via google-api-python-client
+googleapis-common-protos==1.62.0
+    # via google-api-core
+gradio==4.19.0
+    # via -r requirements.in
+gradio-client==0.10.0
+    # via gradio
+grpcio==1.60.1
+    # via
+    #   grpcio-health-checking
+    #   grpcio-tools
+    #   weaviate-client
+grpcio-health-checking==1.60.1
+    # via weaviate-client
+grpcio-tools==1.60.1
+    # via weaviate-client
+h11==0.14.0
+    # via
+    #   httpcore
+    #   uvicorn
+h5py==3.10.0
+    # via hickle
+hickle==5.0.2
+    # via zsvision
+httpcore==1.0.2
+    # via httpx
+httplib2==0.22.0
+    # via
+    #   google-api-python-client
+    #   google-auth-httplib2
+httpx==0.26.0
+    # via
+    #   gradio
+    #   gradio-client
+    #   openai
+    #   weaviate-client
+huggingface-hub==0.20.3
+    # via
+    #   gradio
+    #   gradio-client
+humanize==4.9.0
+    # via zsvision
+idna==3.6
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+    #   tldextract
+    #   yarl
+importlib-metadata==6.11.0
+    # via cohere
+importlib-resources==6.1.1
+    # via gradio
+ipdb==0.13.13
+    # via -r requirements.in
+ipython==8.21.0
+    # via ipdb
+jedi==0.19.1
+    # via ipython
+jieba3k==0.35.1
+    # via newspaper3k
+jinja2==3.1.3
+    # via
+    #   altair
+    #   gradio
+joblib==1.3.2
+    # via nltk
+json5==0.9.14
+    # via -r requirements.in
+jsonpatch==1.33
+    # via
+    #   langchain
+    #   langchain-core
+jsonpointer==2.4
+    # via jsonpatch
+jsonschema==4.21.1
+    # via altair
+jsonschema-specifications==2023.12.1
+    # via jsonschema
+kiwisolver==1.4.5
+    # via matplotlib
+langchain==0.1.7
+    # via -r requirements.in
+langchain-community==0.0.20
+    # via
+    #   -r requirements.in
+    #   langchain
+langchain-core==0.1.23
+    # via
+    #   langchain
+    #   langchain-community
+    #   langchain-openai
+langchain-openai==0.0.6
+    # via -r requirements.in
+langsmith==0.0.87
+    # via
+    #   langchain
+    #   langchain-community
+    #   langchain-core
+lxml==5.1.0
+    # via newspaper3k
+markdown-it-py==3.0.0
+    # via rich
+markupsafe==2.1.5
+    # via
+    #   gradio
+    #   jinja2
+marshmallow==3.20.2
+    # via dataclasses-json
+matplotlib==3.8.2
+    # via
+    #   gradio
+    #   zsvision
+matplotlib-inline==0.1.6
+    # via ipython
+mdurl==0.1.2
+    # via markdown-it-py
+mergedeep==1.3.4
+    # via zsvision
+msgpack==1.0.7
+    # via
+    #   msgpack-numpy
+    #   zsvision
+msgpack-numpy==0.4.8
+    # via zsvision
+multidict==6.0.5
+    # via
+    #   aiohttp
+    #   yarl
+mypy-extensions==1.0.0
+    # via typing-inspect
+newspaper3k==0.2.8
+    # via -r requirements.in
+nltk==3.8.1
+    # via newspaper3k
+numpy==1.26.4
+    # via
+    #   altair
+    #   contourpy
+    #   gradio
+    #   h5py
+    #   hickle
+    #   langchain
+    #   langchain-community
+    #   langchain-openai
+    #   matplotlib
+    #   msgpack-numpy
+    #   pandas
+    #   scipy
+    #   zsvision
+openai==1.12.0
+    # via
+    #   -r requirements.in
+    #   langchain-openai
+orjson==3.9.14
+    # via gradio
+packaging==23.2
+    # via
+    #   altair
+    #   build
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   langchain-core
+    #   marshmallow
+    #   matplotlib
+pandas==2.2.0
+    # via
+    #   altair
+    #   gradio
+parso==0.8.3
+    # via jedi
+pexpect==4.9.0
+    # via ipython
+pillow==10.2.0
+    # via
+    #   gradio
+    #   matplotlib
+    #   newspaper3k
+pip-tools==7.3.0
+    # via -r requirements.in
+prompt-toolkit==3.0.43
+    # via ipython
+protobuf==4.25.2
+    # via
+    #   google-api-core
+    #   googleapis-common-protos
+    #   grpcio-health-checking
+    #   grpcio-tools
+ptyprocess==0.7.0
+    # via pexpect
+pure-eval==0.2.2
+    # via stack-data
+pyasn1==0.5.1
+    # via
+    #   pyasn1-modules
+    #   rsa
+pyasn1-modules==0.3.0
+    # via google-auth
+pycparser==2.21
+    # via cffi
+pydantic==2.6.1
+    # via
+    #   fastapi
+    #   gradio
+    #   langchain
+    #   langchain-core
+    #   langsmith
+    #   openai
+    #   weaviate-client
+pydantic-core==2.16.2
+    # via pydantic
+pydub==0.25.1
+    # via gradio
+pygments==2.17.2
+    # via
+    #   ipython
+    #   rich
+pyparsing==3.1.1
+    # via
+    #   httplib2
+    #   matplotlib
+pyproject-hooks==1.0.0
+    # via build
+python-dateutil==2.8.2
+    # via
+    #   matplotlib
+    #   newspaper3k
+    #   pandas
+python-dotenv==1.0.1
+    # via -r requirements.in
+python-multipart==0.0.9
+    # via gradio
+pytz==2024.1
+    # via pandas
+pyyaml==6.0.1
+    # via
+    #   gradio
+    #   huggingface-hub
+    #   langchain
+    #   langchain-community
+    #   langchain-core
+    #   newspaper3k
+    #   zsvision
+referencing==0.33.0
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
+regex==2023.12.25
+    # via
+    #   nltk
+    #   tiktoken
+requests==2.31.0
+    # via
+    #   cohere
+    #   feedfinder2
+    #   google-api-core
+    #   huggingface-hub
+    #   langchain
+    #   langchain-community
+    #   langchain-core
+    #   langsmith
+    #   newspaper3k
+    #   requests-file
+    #   tiktoken
+    #   tldextract
+    #   weaviate-client
+requests-file==2.0.0
+    # via tldextract
+rich==13.7.0
+    # via typer
+rpds-py==0.18.0
+    # via
+    #   jsonschema
+    #   referencing
+rsa==4.9
+    # via google-auth
+ruff==0.2.1
+    # via gradio
+scipy==1.12.0
+    # via zsvision
+semantic-version==2.10.0
+    # via gradio
+sgmllib3k==1.0.0
+    # via feedparser
+shellingham==1.5.4
+    # via typer
+six==1.16.0
+    # via
+    #   asttokens
+    #   feedfinder2
+    #   python-dateutil
+sniffio==1.3.0
+    # via
+    #   anyio
+    #   httpx
+    #   openai
+soupsieve==2.5
+    # via beautifulsoup4
+sqlalchemy==2.0.27
+    # via
+    #   langchain
+    #   langchain-community
+stack-data==0.6.3
+    # via ipython
+starlette==0.36.3
+    # via fastapi
+tenacity==8.2.3
+    # via
+    #   langchain
+    #   langchain-community
+    #   langchain-core
+tiktoken==0.6.0
+    # via
+    #   -r requirements.in
+    #   langchain-openai
+tinysegmenter==0.3
+    # via newspaper3k
+tldextract==5.1.1
+    # via newspaper3k
+tomlkit==0.12.0
+    # via gradio
+toolz==0.12.1
+    # via altair
+tqdm==4.66.2
+    # via
+    #   huggingface-hub
+    #   nltk
+    #   openai
+traitlets==5.14.1
+    # via
+    #   ipython
+    #   matplotlib-inline
+typeguard==4.1.5
+    # via zsvision
+typer[all]==0.9.0
+    # via gradio
+typing-extensions==4.9.0
+    # via
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   openai
+    #   pydantic
+    #   pydantic-core
+    #   sqlalchemy
+    #   typeguard
+    #   typer
+    #   typing-inspect
+typing-inspect==0.9.0
+    # via dataclasses-json
+tzdata==2024.1
+    # via pandas
+uritemplate==4.1.1
+    # via google-api-python-client
+urllib3==2.2.0
+    # via
+    #   cohere
+    #   requests
+uvicorn==0.27.1
+    # via gradio
+validators==0.22.0
+    # via weaviate-client
+wcwidth==0.2.13
+    # via prompt-toolkit
+weaviate-client==4.4.4
+    # via -r requirements.in
+websockets==11.0.3
+    # via gradio-client
+wheel==0.42.0
+    # via pip-tools
+yarl==1.9.4
+    # via aiohttp
+zipp==3.17.0
+    # via importlib-metadata
+zsvision==0.7.12
+    # via -r requirements.in
+
+# The following packages are considered to be unsafe in a requirements file:
+# pip
+# setuptools
run_pipeline.py
ADDED
@@ -0,0 +1,134 @@
+from step1_api_claim_extractor import ClaimExtractor
+from step2_api_fix_passage_anchors import FixAnchors
+from step3_api_identify_objective_claims import ClassifyClaims
+from step41_api_fetch_cohere_wikipedia_evidence import CohereEvidence
+from step42_api_fetch_google_search_evidence import GoogleEvidence
+from step5_api_embed_search_results import EmbedResults
+from step6_api_claims_to_evidence import ClaimToEvidence
+from step7_api_check_claims_against_evidence import CheckClaimAgainstEvidence
+from step8_api_format_fact_checked_document import FormatDocument
+
+import argparse
+import json
+import os
+import copy
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+def get_fact_checked(text_input, model="gpt-3.5-turbo", mode="slow"):
+    text_input = text_input.strip()
+
+    results = {}
+
+    # STEP1
+    print("Step1: Extracting claims")
+    step1 = ClaimExtractor(model=model)
+    step1_json = step1.extract_claims(text_input)
+    results["step1_claims"] = copy.deepcopy(step1_json)
+
+    # STEP2
+    print("Step2: Anchoring claims")
+    try:
+        step2 = FixAnchors(model=model)
+        step2_json = step2.fix_passage_anchors(step1_json, text_input)
+    except:
+        if model != "gpt-4":
+            print("Step2 failed with gpt-3.5, trying with gpt-4!")
+            step2 = FixAnchors(model="gpt-4")
+            step2_json = step2.fix_passage_anchors(step1_json, text_input)
+    results["step2_anchored_claims"] = copy.deepcopy(step2_json)
+
+    # STEP3
+    print("Step3: Classifying claims")
+    step3 = ClassifyClaims(model=model)
+    step3_json = step3.classify_claims(step2_json)
+    step3_filter = step3.filter_to_objective_claims(step3_json)
+    results["step3_classify_claims"] = copy.deepcopy(step3_json)
+    results["step3_objective_claims"] = copy.deepcopy(step3_filter)
+
+    if len(step3_filter) == 0:
+        return {"fact_checked_md": "No objective claims found!"}
+
+    # STEP4.1
+    print("Step4.1: Gathering evidence")
+    step4_cohere = CohereEvidence()
+    step4_json_cohere = (
+        step4_cohere.fetch_cohere_semantic_search_results_to_gather_evidence(
+            step3_filter
+        )
+    )
+    results["step41_cohere_evidence"] = copy.deepcopy(step4_json_cohere)
+
+    # STEP4.2
+    print("Step4.2: Gathering evidence")
+    step4_json_google = None
+    if mode == "slow":
+        step4_json_google = ""
+        try:
+            step4_google = GoogleEvidence(model=model)
+            step4_json_google = step4_google.fetch_search_results_to_gather_evidence(
+                step3_filter
+            )
+        except Exception as e:
+            print(f"Google search failed: {e}")
+            pass
+    results["step42_google_evidence"] = copy.deepcopy(step4_json_google)
+
+    embedding_model = "text-embedding-ada-002"
+    text_embedding_chunk_size = 500
+
+    srcs = [step4_json_cohere]
+    if step4_json_google:
+        srcs.append(step4_json_google)
+
+    # STEP 5
+    print("Step5: Embedding evidence")
+    step5 = EmbedResults(
+        embedding_model=embedding_model,
+        text_embedding_chunk_size=text_embedding_chunk_size,
+    )
+    faiss_db = step5.embed_for_uuid(srcs)
+
+    # STEP 6
+    print("Step6: Linking claims to evidence")
+    step6 = ClaimToEvidence()
+    step6_json = step6.link_claims_to_evidence(step3_filter, faiss_db)
+    results["step6_claim_to_evidence"] = copy.deepcopy(step6_json)
+
+    # STEP 7
+    print("Step7: Checking claims against evidence")
+    step7 = CheckClaimAgainstEvidence(model=model)
+    step7_json = step7.check_claims_against_evidence(step6_json)
+    results["step7_evaluated_claims"] = copy.deepcopy(step7_json)
+
+    # STEP 8
+    print("Step8: Formatting")
+    if mode == "slow":
+        step8 = FormatDocument(model=model, footnote_style="verbose")
+        step8_md = step8.reformat_document_to_include_claims(
+            text_input, step7_json, footnote_style="verbose"
+        )
+        step8_md_terse = step8.reformat_document_to_include_claims(
+            text_input, step7_json, footnote_style="terse"
+        )
+
+    results["fact_checked_md"] = copy.deepcopy(step8_md)
+    results["fact_checked_terse"] = copy.deepcopy(step8_md_terse)
+    return results
+
+
+def main(args):
+    with open(args.file, "r") as f:
+        text = f.read()
+    out = get_fact_checked(text, mode="slow", model=args.model)
+    print(out["fact_checked_md"])
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Process a file.")
+    parser.add_argument("--file", type=str, help="File to process", required=True)
+    parser.add_argument("--model", type=str, help="Model to use", required=True)
+    args = parser.parse_args()
+    main(args)
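`get_fact_checked` is also the entry point that `app.py` imports, so the pipeline can be driven programmatically; a short sketch:

```python
# illustrative: programmatic use of the pipeline from run_pipeline.py
from run_pipeline import get_fact_checked

results = get_fact_checked(
    "Arendt fled Germany in 1933.", mode="slow", model="gpt-4-1106-preview"
)
print(results["fact_checked_md"])     # verbose footnotes
print(results["fact_checked_terse"])  # terse footnotes
```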
step1_api_claim_extractor.py
ADDED
@@ -0,0 +1,164 @@
+import argparse
+import json
+import multiprocessing as mp
+from zsvision.zs_multiproc import starmap_with_kwargs
+from zsvision.zs_utils import BlockTimer
+from text_utils import is_unique_verbatim_quote, parse_passage_quote_and_claim
+from llm_api_utils import (
+    call_openai_with_exponetial_backoff,
+    estimate_cost_of_text_generation_api_call,
+    init_openai_with_api_key,
+)
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+class ClaimExtractor:
+    def __init__(
+        self,
+        temperature=0,
+        model="gpt-3.5-turbo",
+        filter_str="",
+        processes=1,
+        refresh=False,
+    ):
+        """Initializes ClaimExtractor with the provided arguments"""
+        self.temperature = temperature
+        self.model = model
+        self.filter_str = filter_str
+        self.processes = processes
+        self.refresh = refresh
+
+    def extract_claims_from_passage(
+        self,
+        idx: int,
+        total: int,
+        passage: str,
+    ):
+        init_openai_with_api_key()
+        print(f"Processing passage {idx + 1} of {total}")
+        prompt = f"""\
+Task:
+Enumerate all the discrete factual claims or logical assertions stated in the passage that follows the dashed horizontal line below. \
+To allow the claims to be linked to the passage, use the format: `VERBATIM_PASSAGE_QUOTE_FOR_CLAIM: <verbatim passage quote for claim>, CLAIM: <claim>` on each line. \
+The <verbatim passage quote for claim> must be A SINGLE UNEDITED SUBSTRING from the passage that uniquely identifies the claim. \
+The <verbatim passage quote for claim> must carefully preserve all punctuation and clauses from the original passage. \
+This text will be used in the final national exam.
+
+----------
+Here is an example passage, together with the verbatim passage quotes and claims that should be extracted from it:
+
+Passage:
+Immanuel Kant was born in 1724 into a modest, devoutly religious family, with his father working as a saddle-maker. \
+He was one of nine children, but only five, including Kant, survived to adulthood. \
+His upbringing was steeped in the Pietist tradition, emphasizing intense religious devotion, a literal interpretation of the Bible, and a strong focus on personal morality. \
+Kant attended the University of Königsberg, studying various subjects, including theology, metaphysics, and natural science. \
+After completing his studies, Kant worked as a private tutor for nine years before returning to the University of Königsberg as a lecturer in 1755. \
+In his works Groundwork of the Metaphysics of Morals (1785) and Critique of Practical Reason (1788), Kant argues that morality is not contingent upon personal desires or cultural norms. \
+
+
+Extracted source phrases and claims:
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Immanuel Kant was born in 1724 into a modest, devoutly religious family [CLAIM] Immanuel Kant was born in 1724.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Immanuel Kant was born in 1724 into a modest, devoutly religious family [CLAIM] Immanuel Kant was born into a modest family.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Immanuel Kant was born in 1724 into a modest, devoutly religious family [CLAIM] Immanuel Kant was born into a devoutly religious family.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] with his father working as a saddle-maker [CLAIM] Immanuel Kant's father worked as a saddle-maker.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] He was one of nine children [CLAIM] Immanuel Kant was one of nine children.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] but only five, including Kant survived to adulthood [CLAIM] Only five of Immanuel Kant's parents' children survived to adulthood.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] His upbringing was steeped in the Pietist tradition [CLAIM] Immanuel Kant's upbringing was steeped in the Pietist tradition.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] emphasizing intense religious devotion [CLAIM] Immanuel Kant's upbringing emphasized intense religious devotion.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] a literal interpretation of the Bible [CLAIM] Immanuel Kant's upbringing emphasized a literal interpretation of the Bible.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] a strong focus on personal morality [CLAIM] Immanuel Kant's upbringing emphasized a strong focus on personal morality.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Kant attended the University of Königsberg [CLAIM] Immanuel Kant attended the University of Königsberg.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] studying various subjects, including theology, metaphysics, and natural science [CLAIM] Immanuel Kant studied theology.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] studying various subjects, including theology, metaphysics, and natural science [CLAIM] Immanuel Kant studied metaphysics.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] studying various subjects, including theology, metaphysics, and natural science [CLAIM] Immanuel Kant studied natural science.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] After completing his studies [CLAIM] Immanuel Kant completed his studies.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] After completing his studies, Kant worked as a private tutor for nine years [CLAIM] After completing his studies, Immanuel Kant worked as a private tutor.
+[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] before returning to the University of Königsberg as a lecturer in 1755 [CLAIM] Immanuel Kant returned to the University of Königsberg as a lecturer in 1755.
+
+----------
+Passage:
+{passage}
+
+Extracted source phrases and claims:\
+"""
+        persona = "You are a careful research assistant who helps with fact-checking and editing informative articles."
+        system_message = {"role": "system", "content": persona}
+        user_message = {"role": "user", "content": prompt}
+        messages = [system_message, user_message]
+
+        with BlockTimer(f"Using OpenAI API to extract claims with {self.model}"):
+            response = call_openai_with_exponetial_backoff(
+                model=self.model,
+                temperature=self.temperature,
+                messages=messages,
+            )
+
+        cost = estimate_cost_of_text_generation_api_call(
+            model=self.model, response=response, verbose=True
+        )
+        content = response.choices[0].message.content
+        content = content.strip()
+        quotes_and_claims = content.split("\n")
+
+        parsed_claims = []
+        for quote_and_claim in quotes_and_claims:
+            quote_and_claim = quote_and_claim.strip()
+            if "[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM]" not in quote_and_claim:
+                quote_and_claim = quote_and_claim.replace(
+                    "VERBATIM_PASSAGE_QUOTE_FOR_CLAIM: ",
+                    "[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM]: ",
+                )
+            if "[CLAIM]" not in quote_and_claim:
+                quote_and_claim = quote_and_claim.replace(" CLAIM:", " [CLAIM]:")
+
+            if "[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM]" not in quote_and_claim:
+                continue
+            quote_and_claim = quote_and_claim.strip()
+            parsed = parse_passage_quote_and_claim(quote_and_claim)
+            is_unique_and_verbatim = is_unique_verbatim_quote(
+                verbatim_quote=parsed["verbatim_quote"], original_passage=passage
+            )
+            parsed["is_unique_and_verbatim"] = is_unique_and_verbatim
+            parsed_claims.append(parsed)
+
+        return {"claims": parsed_claims, "cost": cost}
+
+    def extract_claims(self, text_input):
+        """
+        Extracts claims from text_input and returns the extracted claims
+        """
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+        docs = text_splitter.create_documents([text_input])
+        print(f"Split text into {len(docs)} documents")
+        all_claims = []
+
+        kwarg_list = []
+        for idx, doc in enumerate(docs):
+            # remove newlines from the passage to avoid a confusing prompt format
+            passage = doc.page_content.replace("\n", " ")
+            kwarg_list.append(
+                {
+                    "idx": idx,
+                    "total": len(docs),
+                    "passage": passage,
+                }
+            )
+
+        if self.processes == 1:
+            results = []
+            for kwargs in kwarg_list:
+                results.append(self.extract_claims_from_passage(**kwargs))
+        else:  # multiprocess
+            func = self.extract_claims_from_passage
+            with mp.Pool(processes=self.processes) as pool:
+                results = starmap_with_kwargs(
+                    pool=pool, func=func, kwargs_iter=kwarg_list
+                )
+
+        cost = sum([result["cost"] for result in results])
+        all_claims = []
+        for result in results:
+            all_claims.extend(result["claims"])
+
+        print(f"Returning {len(all_claims)} claims (cost: {cost} USD)")
+        return all_claims
step2_api_fix_passage_anchors.py
ADDED
@@ -0,0 +1,165 @@
+import argparse
+import json
+import multiprocessing as mp
+from zsvision.zs_multiproc import starmap_with_kwargs
+from zsvision.zs_utils import BlockTimer
+from text_utils import is_unique_verbatim_quote, parse_passage_quote_and_claim
+from llm_api_utils import (
+    call_openai_with_exponetial_backoff,
+    estimate_cost_of_text_generation_api_call,
+    init_openai_with_api_key,
+)
+
+
+class FixAnchors:
+    def __init__(
+        self,
+        temperature=0,
+        model="gpt-3.5-turbo",
+        filter_str="",
+        processes=8,
+        refresh=False,
+    ):
+        self.temperature = temperature
+        self.model = model
+        self.filter_str = filter_str
+        self.processes = processes
+        self.refresh = refresh
+
+    def fix_passage_anchor(
+        self,
+        idx: int,
+        total: int,
+        original_passage: str,
+        claim_with_metadata: dict,
+    ):
+        init_openai_with_api_key()
+        print(f"Processing claim with metadata {idx + 1} of {total}")
+        # we remove newlines
+        original_passage = original_passage.replace("\n", " ")
+        assert not claim_with_metadata[
+            "is_unique_and_verbatim"
+        ], "We should only fix broken passage anchors"
+
+        prompt = f"""\
+Task:
+A co-worker was tasked with identifying a unique, verbatim quote from a passage that underpins a particular claim. \
+Unfortunately, they made a mistake and the quote they identified is not unique and verbatim. \
+Your task is to fix their quote so that it is both verbatim and unique.
+
+-----
+Here is an example passage, together with the claim and the erroneous quote.
+
+Passage:
+In 1940, she was interned in a French camp as an enemy alien, but managed to escape and eventually make her way to the United States in 1941. \
+Arendt's experiences during this time would deeply influence her work on totalitarianism and human rights. \
+In New York, she began to immerse herself in academic life, working as an editor, journalist, and lecturer. \
+Her first major work, *The Origins of Totalitarianism*, published in 1951, explored the common roots of Nazism and Stalinism, and established her as a significant voice in political philosophy. \
+## A Life Of Controversial, Influential Works \
+Throughout her career, Arendt wrote a number of seminal, and controversial, works. *The Human Condition* (1958) examined the role of politics in modern societies and introduced the concept of "the public realm" – the space where individuals act and participate in political life. \
+This exploration of freedom and action would become a recurring theme in her writings. \
+Her 1963 publication, *Eichmann in Jerusalem: A Report on the Banality of Evil*, based on her coverage of Adolf Eichmann's trial, ignited significant controversy. \
+Arendt argued that Eichmann, a key architect of the Holocaust, was not a monster but rather an ordinary bureaucrat who unquestioningly followed orders. \
+The idea of the "banality of evil" continues to influence discussions on the nature of evil and moral responsibility. \
+Arendt's later works, such as *On Revolution* (1963) and *Between Past and Future* (1968), sought to further unravel the complexities of power, authority, and rebellion. \
+Her writings on these subjects continue to resonate with present-day political struggles, as well as with the works of other philosophers like [Immanuel Kant](/philosophy/2023-immanuel-kant-life-and-work) and [Edmund Husserl](/philosophy/2023-edmund-husserl-his-life-and-work). \
+
+Claim:
+*The Origins of Totalitarianism* established Arendt as a significant voice in political philosophy.
+
+Initial attempt at a unique and verbatim quote:
+[The Origins of Totalitarianism] established her as a significant voice in political philosophy.
+
+Correct (unique and verbatim) quote:
+Her first major work, *The Origins of Totalitarianism*, published in 1951, explored the common roots of Nazism and Stalinism, and established her as a significant voice in political philosophy.
+-----
+Passage:
+{original_passage}
+
+Claim:
+{claim_with_metadata["claim"]}
+
+Initial attempt at a unique verbatim quote:
+{claim_with_metadata["verbatim_quote"]}
|
84 |
+
|
85 |
+
Correct (unique and verbatim) quote:\
|
86 |
+
"""
|
87 |
+
persona = "You are a careful research assistant who helps with fact-checking and editing informative articles."
|
88 |
+
system_message = {"role": "system", "content": persona}
|
89 |
+
user_message = {"role": "user", "content": prompt}
|
90 |
+
messages = [system_message, user_message]
|
91 |
+
|
92 |
+
with BlockTimer(f"Using OpenAI API to extract claims with {self.model}"):
|
93 |
+
response = call_openai_with_exponetial_backoff(
|
94 |
+
model=self.model,
|
95 |
+
temperature=self.temperature,
|
96 |
+
messages=messages,
|
97 |
+
)
|
98 |
+
|
99 |
+
cost = estimate_cost_of_text_generation_api_call(
|
100 |
+
model=self.model, response=response, verbose=True
|
101 |
+
)
|
102 |
+
content = response.choices[0].message.content
|
103 |
+
verbatim_quote = content.rstrip()
|
104 |
+
is_unique_and_verbatim = is_unique_verbatim_quote(
|
105 |
+
verbatim_quote=verbatim_quote, original_passage=original_passage
|
106 |
+
)
|
107 |
+
assert (
|
108 |
+
is_unique_and_verbatim
|
109 |
+
), f"Failed to fix passage anchor: {claim_with_metadata['verbatim_quote']} was updated to {verbatim_quote} but is not unique and verbatim"
|
110 |
+
|
111 |
+
claim_with_metadata["verbatim_quote"] = verbatim_quote
|
112 |
+
return {"claim_with_metadata": claim_with_metadata, "cost": cost}
|
113 |
+
|
114 |
+
def fix_passage_anchors(self, claims_with_metadata, original_passage: str):
|
115 |
+
kwarg_list = []
|
116 |
+
valid_claims_with_metadata = []
|
117 |
+
invalid_claims_with_metadata = []
|
118 |
+
for idx, claim_with_metadata in enumerate(claims_with_metadata):
|
119 |
+
# remove newlines from the passage to avoid a confusing prompt format
|
120 |
+
if not claim_with_metadata["is_unique_and_verbatim"]:
|
121 |
+
invalid_claims_with_metadata.append(claim_with_metadata)
|
122 |
+
else:
|
123 |
+
valid_claims_with_metadata.append(claim_with_metadata)
|
124 |
+
|
125 |
+
for idx, claim_with_metadata in enumerate(invalid_claims_with_metadata):
|
126 |
+
kwarg_list.append(
|
127 |
+
{
|
128 |
+
"idx": idx,
|
129 |
+
"total": len(invalid_claims_with_metadata),
|
130 |
+
"claim_with_metadata": claim_with_metadata,
|
131 |
+
"original_passage": original_passage,
|
132 |
+
}
|
133 |
+
)
|
134 |
+
|
135 |
+
if self.processes == 1:
|
136 |
+
results = []
|
137 |
+
for kwargs in kwarg_list:
|
138 |
+
try:
|
139 |
+
results.append(self.fix_passage_anchor(**kwargs))
|
140 |
+
except Exception as e:
|
141 |
+
print(f"Exception in step2: {e}, model: {self.model}")
|
142 |
+
print("Skipping this claim!")
|
143 |
+
if self.model == "gpt-4":
|
144 |
+
pass
|
145 |
+
else:
|
146 |
+
raise e
|
147 |
+
else: # multiprocess
|
148 |
+
func = self.fix_passage_anchor
|
149 |
+
with mp.Pool(processes=self.processes) as pool:
|
150 |
+
results = starmap_with_kwargs(
|
151 |
+
pool=pool, func=func, kwargs_iter=kwarg_list
|
152 |
+
)
|
153 |
+
|
154 |
+
cost = sum([result["cost"] for result in results])
|
155 |
+
for result in results:
|
156 |
+
valid_claims_with_metadata.append(result["claim_with_metadata"])
|
157 |
+
|
158 |
+
# remove the is_unique_and_verbatim field (no longer needed)
|
159 |
+
for claim_with_metadata in valid_claims_with_metadata:
|
160 |
+
del claim_with_metadata["is_unique_and_verbatim"]
|
161 |
+
|
162 |
+
print(
|
163 |
+
f"Returning {len(valid_claims_with_metadata)} claims with metadata (cost: {cost} USD)"
|
164 |
+
)
|
165 |
+
return valid_claims_with_metadata
|
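FixAnchors.fix_passage_anchors splits the incoming claims into those whose quote already passed the uniqueness check and those that need repair, and only the latter are sent back to the model. A minimal usage sketch, assuming OPENAI_API_KEY is set; the passage and claim are made-up examples rather than real pipeline output:

```python
# Minimal sketch: repair a claim whose passage anchor is not verbatim.
from step2_api_fix_passage_anchors import FixAnchors

passage = (
    "Her first major work, The Origins of Totalitarianism, published in 1951, "
    "established her as a significant voice in political philosophy."
)
claims = [{
    "claim": "The Origins of Totalitarianism was published in 1951.",
    "verbatim_quote": "Origins of Totalitarianism came out in 1951",  # not verbatim
    "is_unique_and_verbatim": False,
}]

fixer = FixAnchors(model="gpt-3.5-turbo", processes=1)
fixed = fixer.fix_passage_anchors(claims, original_passage=passage)
print(fixed[0]["verbatim_quote"])  # repaired, verbatim and unique quote
```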
step3_api_identify_objective_claims.py
ADDED
@@ -0,0 +1,185 @@
1 |
+
import json
|
2 |
+
import argparse
|
3 |
+
import multiprocessing as mp
|
4 |
+
from zsvision.zs_multiproc import starmap_with_kwargs
|
5 |
+
from typing import List, Dict
|
6 |
+
import numpy as np
|
7 |
+
from zsvision.zs_utils import BlockTimer
|
8 |
+
from llm_api_utils import (
|
9 |
+
call_openai_with_exponetial_backoff,
|
10 |
+
estimate_cost_of_text_generation_api_call,
|
11 |
+
init_openai_with_api_key,
|
12 |
+
)
|
13 |
+
import random
|
14 |
+
|
15 |
+
|
16 |
+
class ClassifyClaims:
|
17 |
+
def __init__(
|
18 |
+
self,
|
19 |
+
temperature=0,
|
20 |
+
model="gpt-3.5-turbo",
|
21 |
+
max_claims_per_api_call=10,
|
22 |
+
processes=8,
|
23 |
+
filter_str="",
|
24 |
+
refresh=False,
|
25 |
+
):
|
26 |
+
self.temperature = temperature
|
27 |
+
self.model = model
|
28 |
+
self.max_claims_per_api_call = max_claims_per_api_call
|
29 |
+
self.processes = processes
|
30 |
+
self.filter_str = filter_str
|
31 |
+
self.refresh = refresh
|
32 |
+
self.objective_claims_file = "objective_claims.txt"
|
33 |
+
self.subjective_claims_file = "subjective_claims.txt"
|
34 |
+
|
35 |
+
def parse_classification_label(self, text: str) -> str:
|
36 |
+
raw = text.strip()
|
37 |
+
if raw.endswith("[objective]"):
|
38 |
+
label = "objective"
|
39 |
+
elif raw.endswith("[subjective]"):
|
40 |
+
label = "subjective"
|
41 |
+
else:
|
42 |
+
raise ValueError(f"Invalid label: {raw}")
|
43 |
+
return label
|
44 |
+
|
45 |
+
def read_file(self, file_name):
|
46 |
+
with open(file_name, "r") as f:
|
47 |
+
lines = []
|
48 |
+
for line in f:
|
49 |
+
parsed_line = line.strip()
|
50 |
+
lines.append(parsed_line)
|
51 |
+
return lines
|
52 |
+
|
53 |
+
def create_few_shot_learning_prompt(self) -> str:
|
54 |
+
objective_list = self.read_file(self.objective_claims_file)
|
55 |
+
subjective_list = self.read_file(self.subjective_claims_file)
|
56 |
+
merged_list = list(
|
57 |
+
zip(objective_list, ["[objective]"] * len(objective_list))
|
58 |
+
) + list(zip(subjective_list, ["[subjective]"] * len(subjective_list)))
|
59 |
+
|
60 |
+
# Randomizing the merged list with a specific seed
|
61 |
+
seed = 1234
|
62 |
+
random.seed(seed)
|
63 |
+
random.shuffle(merged_list)
|
64 |
+
prompt = "Claims:\n"
|
65 |
+
for claim, _ in merged_list:
|
66 |
+
prompt += claim + "\n"
|
67 |
+
prompt += "\nClassifications:\n"
|
68 |
+
for claim, classif in merged_list:
|
69 |
+
prompt += claim + " " + classif + "\n"
|
70 |
+
return prompt
|
71 |
+
|
72 |
+
def classify_claim_batch(
|
73 |
+
self,
|
74 |
+
idx: int,
|
75 |
+
total: int,
|
76 |
+
claims_and_sources_batch: List[Dict[str, str]],
|
77 |
+
):
|
78 |
+
print(
|
79 |
+
f"Processing batch {idx+1} of {total} (containing {len(claims_and_sources_batch)} claims)"
|
80 |
+
)
|
81 |
+
|
82 |
+
claim_str = "\n".join([claim["claim"] for claim in claims_and_sources_batch])
|
83 |
+
num_batch_claims = len(claims_and_sources_batch)
|
84 |
+
few_shot = self.create_few_shot_learning_prompt()
|
85 |
+
prompt = f"""\
|
86 |
+
Objective claims can be verified based on factual data (such as those that could be verified by \
|
87 |
+
referencing an encyclopedia), whereas subjective claims involve a personal interpretation of \
|
88 |
+
the data and are more open to debate. \
|
89 |
+
For each of the following claims given below the dashed horizontal line, classify them as \
|
90 |
+
[subjective] or [objective] by suffixing the claim with the appropriate label. OUTPUT ONLY the class, either subjective or objective for each claim!
|
91 |
+
|
92 |
+
Here are some examples:
|
93 |
+
|
94 |
+
{few_shot}
|
95 |
+
----------
|
96 |
+
Claims:
|
97 |
+
{claim_str}
|
98 |
+
|
99 |
+
Classifications:\
|
100 |
+
"""
|
101 |
+
persona = "You are a careful research assistant who helps with fact-checking and editing informative articles."
|
102 |
+
system_message = {"role": "system", "content": persona}
|
103 |
+
user_message = {"role": "user", "content": prompt}
|
104 |
+
messages = [system_message, user_message]
|
105 |
+
|
106 |
+
with BlockTimer(f"Using OpenAI API to extract claims with {self.model}"):
|
107 |
+
response = call_openai_with_exponetial_backoff(
|
108 |
+
model=self.model,
|
109 |
+
temperature=self.temperature,
|
110 |
+
messages=messages,
|
111 |
+
)
|
112 |
+
|
113 |
+
cost = estimate_cost_of_text_generation_api_call(
|
114 |
+
model=self.model, response=response, verbose=True
|
115 |
+
)
|
116 |
+
|
117 |
+
proposed_classified_claims = response.choices[0].message.content
|
118 |
+
batch_classified_claims = proposed_classified_claims.split("\n")
|
119 |
+
|
120 |
+
content = response.choices[0].message.content
|
121 |
+
batch_classified_claims = content.split("\n")
|
122 |
+
assert (
|
123 |
+
len(batch_classified_claims) == num_batch_claims
|
124 |
+
), f"Expected {num_batch_claims} claims, but got {len(batch_classified_claims)}"
|
125 |
+
print(f"Generated {len(batch_classified_claims)} claims (cost: {cost:.4f} USD)")
|
126 |
+
|
127 |
+
claims_with_labels = []
|
128 |
+
for claim_and_source, classified_claim in zip(
|
129 |
+
claims_and_sources_batch, batch_classified_claims
|
130 |
+
):
|
131 |
+
claim_label = self.parse_classification_label(classified_claim)
|
132 |
+
claim_and_source["label"] = claim_label
|
133 |
+
claims_with_labels.append(claim_and_source)
|
134 |
+
return {"claims_with_labels": claims_with_labels, "cost": cost}
|
135 |
+
|
136 |
+
def classify_claims(self, claims_and_sources):
|
137 |
+
"""
|
138 |
+
Classify claims as being either subjective or objective, and return the labelled claims.
|
139 |
+
"""
|
140 |
+
init_openai_with_api_key()
|
141 |
+
num_claims = len(claims_and_sources)
|
142 |
+
|
143 |
+
# we limit the number of claims per api call (otherwise GPT-4 can choke)
|
144 |
+
num_batches = int(np.ceil(num_claims / self.max_claims_per_api_call))
|
145 |
+
claims_and_sources_batches = [
|
146 |
+
batch.tolist() for batch in np.array_split(claims_and_sources, num_batches)
|
147 |
+
]
|
148 |
+
|
149 |
+
kwarg_list = []
|
150 |
+
for idx, claims_and_sources_batch in enumerate(claims_and_sources_batches):
|
151 |
+
# remove newlines from the passage to avoid a confusing prompt format
|
152 |
+
kwarg_list.append(
|
153 |
+
{
|
154 |
+
"idx": idx,
|
155 |
+
"total": len(claims_and_sources_batches),
|
156 |
+
"claims_and_sources_batch": claims_and_sources_batch,
|
157 |
+
}
|
158 |
+
)
|
159 |
+
|
160 |
+
if self.processes == 1:
|
161 |
+
batch_results = []
|
162 |
+
for kwargs in kwarg_list:
|
163 |
+
batch_results.append(self.classify_claim_batch(**kwargs))
|
164 |
+
else: # multiprocess
|
165 |
+
func = self.classify_claim_batch
|
166 |
+
with mp.Pool(processes=self.processes) as pool:
|
167 |
+
batch_results = starmap_with_kwargs(
|
168 |
+
pool=pool, func=func, kwargs_iter=kwarg_list
|
169 |
+
)
|
170 |
+
|
171 |
+
cost = sum([result["cost"] for result in batch_results])
|
172 |
+
labelled_claims = []
|
173 |
+
for batch in batch_results:
|
174 |
+
labelled_claims.extend(batch["claims_with_labels"])
|
175 |
+
|
176 |
+
print(f"Returning {len(labelled_claims)} claims (cost: {cost} USD)")
|
177 |
+
return labelled_claims
|
178 |
+
|
179 |
+
def filter_to_objective_claims(self, claims):
|
180 |
+
"""Filter claims to only those that are objective."""
|
181 |
+
|
182 |
+
objective_claims = [claim for claim in claims if claim["label"] == "objective"]
|
183 |
+
|
184 |
+
print(f"Returning {len(objective_claims)} objective claims")
|
185 |
+
return objective_claims
|
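ClassifyClaims labels each claim [objective] or [subjective] in batches of max_claims_per_api_call, using the claims in objective_claims.txt and subjective_claims.txt as few-shot examples, and filter_to_objective_claims keeps only the objective ones for fact-checking. A minimal sketch with hypothetical claims, assuming OPENAI_API_KEY is set and the two few-shot files are in the working directory:

```python
from step3_api_identify_objective_claims import ClassifyClaims

# Hypothetical claims: one verifiable, one a matter of opinion.
claims = [
    {"claim": "Hannah Arendt was born in 1906.", "verbatim_quote": "born in 1906"},
    {"claim": "Arendt's prose style is wonderfully vivid.", "verbatim_quote": "wonderfully vivid"},
]

classifier = ClassifyClaims(model="gpt-3.5-turbo", processes=1)
labelled = classifier.classify_claims(claims)           # adds a "label" field
objective = classifier.filter_to_objective_claims(labelled)
print([c["claim"] for c in objective])
```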
step41_api_fetch_cohere_wikipedia_evidence.py
ADDED
@@ -0,0 +1,97 @@
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
import multiprocessing as mp
|
4 |
+
from datetime import datetime
|
5 |
+
import time
|
6 |
+
from zsvision.zs_multiproc import starmap_with_kwargs
|
7 |
+
import weaviate
|
8 |
+
import os
|
9 |
+
|
10 |
+
|
11 |
+
class CohereEvidence:
|
12 |
+
def __init__(self, processes=8, filter_str="", refresh=False):
|
13 |
+
self.processes = processes
|
14 |
+
self.filter_str = filter_str
|
15 |
+
self.refresh = refresh
|
16 |
+
|
17 |
+
def semantic_search(self, query, client, results_lang=""):
|
18 |
+
"""
|
19 |
+
Query the vectors database and return the top results.
|
20 |
+
"""
|
21 |
+
|
22 |
+
nearText = {"concepts": [query]}
|
23 |
+
properties = ["text", "title", "url", "views", "lang", "_additional {distance}"]
|
24 |
+
|
25 |
+
# To filter by language
|
26 |
+
if results_lang != "":
|
27 |
+
where_filter = {
|
28 |
+
"path": ["lang"],
|
29 |
+
"operator": "Equal",
|
30 |
+
"valueString": results_lang,
|
31 |
+
}
|
32 |
+
response = (
|
33 |
+
client.query.get("Articles", properties)
|
34 |
+
.with_where(where_filter)
|
35 |
+
.with_near_text(nearText)
|
36 |
+
.with_limit(5)
|
37 |
+
.do()
|
38 |
+
)
|
39 |
+
|
40 |
+
# Search all languages
|
41 |
+
else:
|
42 |
+
response = (
|
43 |
+
client.query.get("Articles", properties)
|
44 |
+
.with_near_text(nearText)
|
45 |
+
.with_limit(5)
|
46 |
+
.do()
|
47 |
+
)
|
48 |
+
|
49 |
+
result = response["data"]["Get"]["Articles"]
|
50 |
+
|
51 |
+
return result
|
52 |
+
|
53 |
+
def fetch_cohere_semantic_search_results_to_gather_evidence(
|
54 |
+
self,
|
55 |
+
queryset: dict,
|
56 |
+
):
|
57 |
+
"""
|
58 |
+
Fetch Wikipedia paragraphs that can be used to verify each claim, via Cohere/Weaviate semantic search.
|
59 |
+
"""
|
60 |
+
# 10M wiki embeddings (1M in English)
|
61 |
+
weaviate_api_key = os.environ.get("WEAVIATE_API_KEY")
|
62 |
+
|
63 |
+
cohere_api_key = os.environ.get("COHERE_API_KEY")
|
64 |
+
|
65 |
+
client = weaviate.Client(
|
66 |
+
url="https://cohere-demo.weaviate.network/",
|
67 |
+
auth_client_secret=weaviate.auth.AuthApiKey(
|
68 |
+
api_key=weaviate_api_key
|
69 |
+
), # Replace w/ your Weaviate instance API key
|
70 |
+
additional_headers={
|
71 |
+
"X-Cohere-Api-Key": cohere_api_key # Replace with your inference API key
|
72 |
+
},
|
73 |
+
)
|
74 |
+
|
75 |
+
while not client.is_ready():
|
76 |
+
print(f"Waiting for client to be ready")
|
77 |
+
time.sleep(1)
|
78 |
+
|
79 |
+
for item in queryset:
|
80 |
+
results = self.semantic_search(
|
81 |
+
item["claim"], client=client, results_lang="en"
|
82 |
+
)
|
83 |
+
# rename "url" to "link" to be consistent with google results
|
84 |
+
reformatted_results = []
|
85 |
+
for result in results:
|
86 |
+
result["link"] = result.pop("url")
|
87 |
+
reformatted_results.append(result)
|
88 |
+
item["search_results"] = reformatted_results
|
89 |
+
|
90 |
+
# update the queryset with new information
|
91 |
+
date_str = datetime.now().strftime("%Y-%m-%d")
|
92 |
+
results = {
|
93 |
+
"documents": queryset,
|
94 |
+
"dates": {"results_fetched_from_wikipedia_1M_with_cohere-22-12": date_str},
|
95 |
+
}
|
96 |
+
print(f"Returning Cohere Wikipedia paragraphs for {len(queryset)} queries")
|
97 |
+
return results
|
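CohereEvidence queries the public Cohere/Weaviate Wikipedia demo index and attaches the top five matching paragraphs to each claim under search_results, renaming url to link so later steps treat Wikipedia and Google evidence uniformly. A minimal sketch with a hypothetical claim, assuming WEAVIATE_API_KEY and COHERE_API_KEY are set as described in the README:

```python
from step41_api_fetch_cohere_wikipedia_evidence import CohereEvidence

queryset = [{"claim": "Hannah Arendt was born in Linden, Hanover, in 1906."}]

fetcher = CohereEvidence()
results = fetcher.fetch_cohere_semantic_search_results_to_gather_evidence(queryset)
for hit in results["documents"][0]["search_results"]:
    print(hit["title"], hit["link"])
```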
step42_api_fetch_google_search_evidence.py
ADDED
@@ -0,0 +1,145 @@
1 |
+
import requests
|
2 |
+
from bs4 import BeautifulSoup
|
3 |
+
from zsvision.zs_utils import BlockTimer
|
4 |
+
import json
|
5 |
+
import json5
|
6 |
+
import argparse
|
7 |
+
import multiprocessing as mp
|
8 |
+
from zsvision.zs_multiproc import starmap_with_kwargs
|
9 |
+
from datetime import datetime
|
10 |
+
import urllib.robotparser
|
11 |
+
import urllib.parse
|
12 |
+
from urllib.parse import urlunparse
|
13 |
+
from utils import get_google_search_results
|
14 |
+
|
15 |
+
import time
|
16 |
+
from random import randint
|
17 |
+
from fake_useragent import UserAgent
|
18 |
+
from newspaper import Article, Config
|
19 |
+
|
20 |
+
|
21 |
+
class GoogleEvidence:
|
22 |
+
def __init__(
|
23 |
+
self,
|
24 |
+
model="gpt-3.5-turbo",
|
25 |
+
limit=0,
|
26 |
+
refresh=False,
|
27 |
+
num_search_results_to_keep=3,
|
28 |
+
filter_str="",
|
29 |
+
processes=8,
|
30 |
+
):
|
31 |
+
self.model = model
|
32 |
+
self.limit = limit
|
33 |
+
self.refresh = refresh
|
34 |
+
self.num_search_results_to_keep = num_search_results_to_keep
|
35 |
+
self.filter_str = filter_str
|
36 |
+
self.processes = processes
|
37 |
+
|
38 |
+
def can_index(self, url, user_agent_name):
|
39 |
+
rp = urllib.robotparser.RobotFileParser()
|
40 |
+
robots_url = f"{url.scheme}://{url.netloc}/robots.txt"
|
41 |
+
|
42 |
+
headers = {
|
43 |
+
"User-Agent": user_agent_name,
|
44 |
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
45 |
+
"Accept-Language": "en-US,en;q=0.5",
|
46 |
+
"DNT": "1",
|
47 |
+
"Connection": "keep-alive",
|
48 |
+
"Upgrade-Insecure-Requests": "1",
|
49 |
+
}
|
50 |
+
|
51 |
+
try:
|
52 |
+
req = urllib.request.Request(robots_url, headers=headers)
|
53 |
+
with urllib.request.urlopen(req) as response:
|
54 |
+
rp.parse(response.read().decode("utf-8").splitlines())
|
55 |
+
|
56 |
+
ok_to_index = rp.can_fetch(user_agent_name, url.geturl())
|
57 |
+
except urllib.error.URLError:
|
58 |
+
# If there is no robots.txt or there is an error accessing it, assume it's okay to index
|
59 |
+
ok_to_index = True
|
60 |
+
except Exception as e:
|
61 |
+
print(f"An unexpected error occurred in step42: {e}")
|
62 |
+
# going the safe route
|
63 |
+
ok_to_index = False
|
64 |
+
return ok_to_index
|
65 |
+
|
66 |
+
def fetch_search_results_to_gather_evidence(
|
67 |
+
self,
|
68 |
+
queryset: dict,
|
69 |
+
):
|
70 |
+
user_agent = UserAgent()
|
71 |
+
config = Config()
|
72 |
+
config.fetch_images = False
|
73 |
+
|
74 |
+
user_agent_name = "FiltirBot/1.0 (+https://filtir.com/filtirbot-info)"
|
75 |
+
|
76 |
+
headers = {
|
77 |
+
"User-Agent": user_agent_name,
|
78 |
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
79 |
+
"Accept-Language": "en-US,en;q=0.5",
|
80 |
+
"DNT": "1",
|
81 |
+
"Connection": "keep-alive",
|
82 |
+
"Upgrade-Insecure-Requests": "1",
|
83 |
+
}
|
84 |
+
|
85 |
+
# we assume some sites won't permit indexing, so we'll skip these
|
86 |
+
num_results = self.num_search_results_to_keep + 5
|
87 |
+
results = {}
|
88 |
+
|
89 |
+
print(f"Found {len(queryset)} claims to fetch search results for")
|
90 |
+
|
91 |
+
for queryset_idx, item in enumerate(queryset):
|
92 |
+
with BlockTimer(
|
93 |
+
f"Fetching search results from Google {queryset_idx + 1}/{len(queryset)}"
|
94 |
+
):
|
95 |
+
search_results = get_google_search_results(
|
96 |
+
query_str=item["claim"], num_results=num_results
|
97 |
+
)
|
98 |
+
|
99 |
+
if search_results == [{"Result": "No good Google Search Result was found"}]:
|
100 |
+
item["search_results"] = []
|
101 |
+
continue
|
102 |
+
|
103 |
+
parsed_results = []
|
104 |
+
for search_result in search_results:
|
105 |
+
if not self.can_index(
|
106 |
+
urllib.parse.urlparse(search_result["link"]),
|
107 |
+
user_agent_name=user_agent_name,
|
108 |
+
):
|
109 |
+
print(
|
110 |
+
f"Skipping {search_result['link']} because it doesn't permit indexing"
|
111 |
+
)
|
112 |
+
continue
|
113 |
+
try:
|
114 |
+
config.browser_user_agent = user_agent.random
|
115 |
+
article = Article(
|
116 |
+
search_result["link"], language="en", config=config
|
117 |
+
)
|
118 |
+
article.download()
|
119 |
+
article.parse()
|
120 |
+
text = article.text
|
121 |
+
except Exception as e:
|
122 |
+
print(f"Error parsing article: {e}, trying with requests.get...")
|
123 |
+
try:
|
124 |
+
response = requests.get(
|
125 |
+
search_result["link"], timeout=15, headers=headers
|
126 |
+
)
|
127 |
+
html = response.text
|
128 |
+
soup = BeautifulSoup(html, features="html.parser")
|
129 |
+
text = soup.get_text()
|
130 |
+
except Exception as exception:
|
131 |
+
print(f"Error parsing article: {exception}, skipping")
|
132 |
+
continue
|
133 |
+
|
134 |
+
search_result["text"] = text
|
135 |
+
parsed_results.append(search_result)
|
136 |
+
if len(parsed_results) == self.num_search_results_to_keep:
|
137 |
+
break
|
138 |
+
item["search_results"] = parsed_results
|
139 |
+
|
140 |
+
# update the queryset with new information
|
141 |
+
date_str = datetime.now().strftime("%Y-%m-%d")
|
142 |
+
results = {"documents": queryset, "dates": {"search_results_fetched": date_str}}
|
143 |
+
|
144 |
+
print(f"Returning web pages for search results for {len(queryset)} queries")
|
145 |
+
return results
|
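GoogleEvidence does the same for the open web: it over-fetches Google results, skips pages whose robots.txt disallows FiltirBot, extracts article text with newspaper (falling back to requests + BeautifulSoup), and keeps the first num_search_results_to_keep pages that parse. A minimal sketch with a hypothetical claim, assuming GOOGLE_CLOUD_API_KEY and GOOGLE_CUSTOM_SEARCH_ENGINE_ID are set:

```python
from step42_api_fetch_google_search_evidence import GoogleEvidence

queryset = [{"claim": "The Human Condition was published in 1958."}]

fetcher = GoogleEvidence(num_search_results_to_keep=3)
results = fetcher.fetch_search_results_to_gather_evidence(queryset)
for hit in results["documents"][0]["search_results"]:
    print(hit["link"], len(hit["text"]), "characters of page text")
```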
step5_api_embed_search_results.py
ADDED
@@ -0,0 +1,290 @@
1 |
+
import faiss
|
2 |
+
import shutil
|
3 |
+
from beartype import beartype
|
4 |
+
import numpy as np
|
5 |
+
import json
|
6 |
+
import argparse
|
7 |
+
from zsvision.zs_utils import BlockTimer
|
8 |
+
import tiktoken
|
9 |
+
from pathlib import Path
|
10 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
11 |
+
from pipeline_paths import PIPELINE_PATHS
|
12 |
+
from llm_api_utils import (
|
13 |
+
init_openai_with_api_key,
|
14 |
+
EMBEDDING_DIMENSIONS,
|
15 |
+
PRICE_PER_1K_TOKENS,
|
16 |
+
)
|
17 |
+
from langchain_community.vectorstores import FAISS
|
18 |
+
from langchain_openai import OpenAIEmbeddings
|
19 |
+
from langchain.docstore.in_memory import InMemoryDocstore
|
20 |
+
|
21 |
+
|
22 |
+
class EmbedResults:
|
23 |
+
def __init__(
|
24 |
+
self,
|
25 |
+
embedding_model="ada",
|
26 |
+
limit=0,
|
27 |
+
refresh=False,
|
28 |
+
refresh_faiss_db=False,
|
29 |
+
text_embedding_chunk_size=500,
|
30 |
+
filter_str="",
|
31 |
+
):
|
32 |
+
self.embedding_model = embedding_model
|
33 |
+
self.limit = limit
|
34 |
+
self.refresh = refresh
|
35 |
+
self.refresh_faiss_db = refresh_faiss_db
|
36 |
+
self.text_embedding_chunk_size = text_embedding_chunk_size
|
37 |
+
self.filter_str = filter_str
|
38 |
+
|
39 |
+
@beartype
|
40 |
+
def compute_embeddings_from_chunks(
|
41 |
+
self, embedding_function: OpenAIEmbeddings, metadatas: list, faiss_db
|
42 |
+
):
|
43 |
+
doc_chunks = []
|
44 |
+
metadatas_without_chunks = []
|
45 |
+
for metadata in metadatas:
|
46 |
+
doc_chunk = metadata.pop("doc_chunk")
|
47 |
+
doc_chunks.append(doc_chunk)
|
48 |
+
metadatas_without_chunks.append(metadata)
|
49 |
+
|
50 |
+
with BlockTimer(f"Embedding {len(metadatas)} fragments"):
|
51 |
+
embeddings = embedding_function.embed_documents(doc_chunks)
|
52 |
+
# account for name mangling in Python
|
53 |
+
faiss_db._FAISS__add(doc_chunks, embeddings, metadatas_without_chunks)
|
54 |
+
|
55 |
+
return faiss_db
|
56 |
+
|
57 |
+
@beartype
|
58 |
+
def parse_date_of_fetching(self, data: dict) -> str:
|
59 |
+
evidence_keys = {
|
60 |
+
"search_results_fetched",
|
61 |
+
"results_fetched_from_wikipedia_1M_with_cohere-22-12",
|
62 |
+
}
|
63 |
+
for key in evidence_keys:
|
64 |
+
if key in data["dates"]:
|
65 |
+
evidence_fetched_date = data["dates"][key]
|
66 |
+
return evidence_fetched_date
|
67 |
+
raise ValueError(f"Could not find evidence fetched date in {data['dates']}")
|
68 |
+
|
69 |
+
def embed_for_uuid(self, srcs):
|
70 |
+
init_openai_with_api_key()
|
71 |
+
|
72 |
+
embedding_function = OpenAIEmbeddings()
|
73 |
+
|
74 |
+
index = faiss.IndexFlatL2(EMBEDDING_DIMENSIONS[self.embedding_model])
|
75 |
+
docstore = InMemoryDocstore({})
|
76 |
+
index_to_docstore_id = {}
|
77 |
+
faiss_db = FAISS(
|
78 |
+
embedding_function=embedding_function.embed_query,
|
79 |
+
index=index,
|
80 |
+
docstore=docstore,
|
81 |
+
index_to_docstore_id=index_to_docstore_id,
|
82 |
+
)
|
83 |
+
|
84 |
+
already_embedded_chunks = {
|
85 |
+
doc.metadata["chunk_tag"] for doc in faiss_db.docstore._dict.values()
|
86 |
+
}
|
87 |
+
|
88 |
+
splitter = RecursiveCharacterTextSplitter(
|
89 |
+
chunk_size=self.text_embedding_chunk_size,
|
90 |
+
chunk_overlap=0,
|
91 |
+
)
|
92 |
+
|
93 |
+
kwarg_list = []
|
94 |
+
seen_links = set()
|
95 |
+
metadatas = []
|
96 |
+
total_chunks = 0
|
97 |
+
chunks_to_embed = 0
|
98 |
+
chunks_to_skip = 0
|
99 |
+
|
100 |
+
for data in srcs:
|
101 |
+
evidence_fetched_date = self.parse_date_of_fetching(data)
|
102 |
+
|
103 |
+
for document in data["documents"]:
|
104 |
+
for search_result in document["search_results"]:
|
105 |
+
# Don't embed the same link twice
|
106 |
+
if search_result["link"] in seen_links:
|
107 |
+
continue
|
108 |
+
seen_links.add(search_result["link"])
|
109 |
+
|
110 |
+
doc_chunks = [
|
111 |
+
doc.page_content
|
112 |
+
for doc in splitter.create_documents([search_result["text"]])
|
113 |
+
]
|
114 |
+
chunk_tags = [
|
115 |
+
f"{search_result['link']}-chunk-{idx}-chunk_sz-{self.text_embedding_chunk_size}"
|
116 |
+
for idx in range(len(doc_chunks))
|
117 |
+
]
|
118 |
+
for doc_chunk, chunk_tag in zip(doc_chunks, chunk_tags):
|
119 |
+
if chunk_tag not in already_embedded_chunks:
|
120 |
+
metadatas.append(
|
121 |
+
{
|
122 |
+
"doc_chunk": doc_chunk,
|
123 |
+
"link": search_result["link"],
|
124 |
+
"chunk_tag": chunk_tag,
|
125 |
+
"date_accessed": evidence_fetched_date,
|
126 |
+
"query": document["claim"],
|
127 |
+
}
|
128 |
+
)
|
129 |
+
chunks_to_embed += 1
|
130 |
+
else:
|
131 |
+
chunks_to_skip += 1
|
132 |
+
total_chunks += len(doc_chunks)
|
133 |
+
|
134 |
+
encoding = tiktoken.encoding_for_model(self.embedding_model)
|
135 |
+
doc_chunks = [x["doc_chunk"] for x in metadatas]
|
136 |
+
num_words = len(" ".join(doc_chunks).split())
|
137 |
+
num_tokens = len(encoding.encode("".join(doc_chunks)))
|
138 |
+
|
139 |
+
print(
|
140 |
+
f"Created {total_chunks} chunks of text to answer from {len(seen_links)} websites"
|
141 |
+
)
|
142 |
+
print(
|
143 |
+
f"Embedding {chunks_to_embed} (skipping {chunks_to_skip}) chunks of text from {len(seen_links)} websites)"
|
144 |
+
)
|
145 |
+
print(
|
146 |
+
f"Embedding {num_tokens} tokens ({num_words} words) from {len(doc_chunks)} chunks"
|
147 |
+
)
|
148 |
+
print(
|
149 |
+
f"Step5: Estimated cost: {num_tokens * PRICE_PER_1K_TOKENS[self.embedding_model]['embed'] / 1000:.2f} USD"
|
150 |
+
)
|
151 |
+
|
152 |
+
if metadatas:
|
153 |
+
self.compute_embeddings_from_chunks(
|
154 |
+
embedding_function=embedding_function,
|
155 |
+
faiss_db=faiss_db,
|
156 |
+
metadatas=metadatas,
|
157 |
+
)
|
158 |
+
|
159 |
+
return faiss_db
|
160 |
+
return None
|
161 |
+
|
162 |
+
def embed(self):
|
163 |
+
init_openai_with_api_key()
|
164 |
+
src_paths = []
|
165 |
+
for evidence_key in (
|
166 |
+
"google_search_results_evidence",
|
167 |
+
"cohere_wikipedia_evidence",
|
168 |
+
):
|
169 |
+
evidence_paths = list(PIPELINE_PATHS[evidence_key].glob("**/*.json"))
|
170 |
+
src_paths.extend(evidence_paths)
|
171 |
+
|
172 |
+
if self.filter_str:
|
173 |
+
num_paths = len(src_paths)
|
174 |
+
src_paths = [
|
175 |
+
src_path for src_path in src_paths if self.filter_str in src_path.name
|
176 |
+
]
|
177 |
+
print(
|
178 |
+
f"Filtering for {self.filter_str} (from {num_paths} to {len(src_paths)})"
|
179 |
+
)
|
180 |
+
|
181 |
+
print(f"Found {len(src_paths)} collections of evidence")
|
182 |
+
src_paths = sorted(src_paths)
|
183 |
+
|
184 |
+
embedding_function = OpenAIEmbeddings()
|
185 |
+
faiss_persist_dir = (
|
186 |
+
PIPELINE_PATHS["faiss_db_embeddings_for_evidence"]
|
187 |
+
/ f"{self.embedding_model}_chunk_size_{self.text_embedding_chunk_size}"
|
188 |
+
)
|
189 |
+
|
190 |
+
if faiss_persist_dir.exists():
|
191 |
+
if self.refresh_faiss_db:
|
192 |
+
print(f"Deleting existing database at {faiss_persist_dir}")
|
193 |
+
shutil.rmtree(faiss_persist_dir)
|
194 |
+
|
195 |
+
# check which chunks we've already embedded to avoid duplication
|
196 |
+
if faiss_persist_dir.exists() and not self.refresh_faiss_db:
|
197 |
+
faiss_db = FAISS.load_local(
|
198 |
+
folder_path=str(faiss_persist_dir), embeddings=embedding_function
|
199 |
+
)
|
200 |
+
print(f"Found existing database at {faiss_persist_dir}, using... ")
|
201 |
+
else:
|
202 |
+
index = faiss.IndexFlatL2(EMBEDDING_DIMENSIONS[self.embedding_model])
|
203 |
+
docstore = InMemoryDocstore({})
|
204 |
+
index_to_docstore_id = {}
|
205 |
+
faiss_db = FAISS(
|
206 |
+
embedding_function=embedding_function.embed_query,
|
207 |
+
index=index,
|
208 |
+
docstore=docstore,
|
209 |
+
index_to_docstore_id=index_to_docstore_id,
|
210 |
+
)
|
211 |
+
print(f"Persisting initialised database to {faiss_persist_dir}")
|
212 |
+
faiss_db.save_local(folder_path=str(faiss_persist_dir))
|
213 |
+
|
214 |
+
already_embedded_chunks = {
|
215 |
+
doc.metadata["chunk_tag"] for doc in faiss_db.docstore._dict.values()
|
216 |
+
}
|
217 |
+
|
218 |
+
splitter = RecursiveCharacterTextSplitter(
|
219 |
+
chunk_size=self.text_embedding_chunk_size,
|
220 |
+
chunk_overlap=0,
|
221 |
+
)
|
222 |
+
|
223 |
+
kwarg_list = []
|
224 |
+
seen_links = set()
|
225 |
+
metadatas = []
|
226 |
+
total_chunks = 0
|
227 |
+
chunks_to_embed = 0
|
228 |
+
chunks_to_skip = 0
|
229 |
+
|
230 |
+
for src_path in src_paths:
|
231 |
+
with open(src_path, "r") as f:
|
232 |
+
data = json.load(f)
|
233 |
+
|
234 |
+
evidence_fetched_date = self.parse_date_of_fetching(data)
|
235 |
+
|
236 |
+
for document in data["documents"]:
|
237 |
+
for search_result in document["search_results"]:
|
238 |
+
# Don't embed the same link twice
|
239 |
+
if search_result["link"] in seen_links:
|
240 |
+
continue
|
241 |
+
seen_links.add(search_result["link"])
|
242 |
+
|
243 |
+
doc_chunks = [
|
244 |
+
doc.page_content
|
245 |
+
for doc in splitter.create_documents([search_result["text"]])
|
246 |
+
]
|
247 |
+
chunk_tags = [
|
248 |
+
f"{search_result['link']}-chunk-{idx}-chunk_sz-{self.text_embedding_chunk_size}"
|
249 |
+
for idx in range(len(doc_chunks))
|
250 |
+
]
|
251 |
+
for doc_chunk, chunk_tag in zip(doc_chunks, chunk_tags):
|
252 |
+
if chunk_tag not in already_embedded_chunks:
|
253 |
+
metadatas.append(
|
254 |
+
{
|
255 |
+
"doc_chunk": doc_chunk,
|
256 |
+
"link": search_result["link"],
|
257 |
+
"chunk_tag": chunk_tag,
|
258 |
+
"date_accessed": evidence_fetched_date,
|
259 |
+
"query": document["claim"],
|
260 |
+
}
|
261 |
+
)
|
262 |
+
chunks_to_embed += 1
|
263 |
+
else:
|
264 |
+
chunks_to_skip += 1
|
265 |
+
total_chunks += len(doc_chunks)
|
266 |
+
|
267 |
+
encoding = tiktoken.encoding_for_model(self.embedding_model)
|
268 |
+
doc_chunks = [x["doc_chunk"] for x in metadatas]
|
269 |
+
num_words = len(" ".join(doc_chunks).split())
|
270 |
+
num_tokens = len(encoding.encode("".join(doc_chunks)))
|
271 |
+
|
272 |
+
print(
|
273 |
+
f"Created {total_chunks} chunks of text to answer from {len(seen_links)} websites"
|
274 |
+
)
|
275 |
+
print(
|
276 |
+
f"Embedding {chunks_to_embed} (skipping {chunks_to_skip}) chunks of text from {len(seen_links)} websites)"
|
277 |
+
)
|
278 |
+
print(
|
279 |
+
f"Embedding {num_tokens} tokens ({num_words} words) from {len(doc_chunks)} chunks"
|
280 |
+
)
|
281 |
+
print(
|
282 |
+
f"Estimated cost: {num_tokens * PRICE_PER_1K_TOKENS[self.embedding_model]['embed'] / 1000:.2f} USD"
|
283 |
+
)
|
284 |
+
|
285 |
+
if metadatas:
|
286 |
+
self.compute_embeddings_from_chunks(
|
287 |
+
embedding_function=embedding_function,
|
288 |
+
faiss_db=faiss_db,
|
289 |
+
metadatas=metadatas,
|
290 |
+
)
|
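EmbedResults.embed_for_uuid chunks every fetched page into text_embedding_chunk_size-character pieces, embeds them with OpenAI embeddings, and stores them in an in-memory FAISS index with the link, chunk tag, access date and originating claim as metadata. A minimal sketch, assuming OPENAI_API_KEY is set; the evidence dict is hypothetical and only mimics the output shape of steps 4.1/4.2:

```python
from step5_api_embed_search_results import EmbedResults

# Hypothetical evidence payload shaped like the output of steps 4.1/4.2.
evidence = [{
    "dates": {"search_results_fetched": "2024-01-01"},
    "documents": [{
        "claim": "Hannah Arendt was born in 1906.",
        "search_results": [{
            "link": "https://en.wikipedia.org/wiki/Hannah_Arendt",
            "text": "Hannah Arendt was born as Johanna Arendt in 1906 ...",
        }],
    }],
}]

embedder = EmbedResults()                 # default "ada" embedding settings
faiss_db = embedder.embed_for_uuid(evidence)
```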
step6_api_claims_to_evidence.py
ADDED
@@ -0,0 +1,85 @@
1 |
+
import faiss
|
2 |
+
import shutil
|
3 |
+
from beartype import beartype
|
4 |
+
import numpy as np
|
5 |
+
import json
|
6 |
+
import argparse
|
7 |
+
from zsvision.zs_utils import BlockTimer
|
8 |
+
import tiktoken
|
9 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
10 |
+
import multiprocessing as mp
|
11 |
+
from zsvision.zs_multiproc import starmap_with_kwargs
|
12 |
+
from llm_api_utils import init_openai_with_api_key, PRICE_PER_1K_TOKENS
|
13 |
+
import multiprocessing as mp
|
14 |
+
from zsvision.zs_multiproc import starmap_with_kwargs
|
15 |
+
from langchain_community.vectorstores import FAISS
|
16 |
+
from langchain_openai import OpenAIEmbeddings
|
17 |
+
from langchain.docstore.in_memory import InMemoryDocstore
|
18 |
+
|
19 |
+
|
20 |
+
class ClaimToEvidence:
|
21 |
+
def __init__(
|
22 |
+
self,
|
23 |
+
embedding_model="ada",
|
24 |
+
limit=0,
|
25 |
+
refresh=False,
|
26 |
+
processes=1,
|
27 |
+
num_chunks_per_worker=50,
|
28 |
+
filter_str="",
|
29 |
+
text_embedding_chunk_size=500,
|
30 |
+
k_nearest_neighbours=3,
|
31 |
+
):
|
32 |
+
self.embedding_model = embedding_model
|
33 |
+
self.limit = limit
|
34 |
+
self.refresh = refresh
|
35 |
+
self.processes = processes
|
36 |
+
self.num_chunks_per_worker = num_chunks_per_worker
|
37 |
+
self.filter_str = filter_str
|
38 |
+
self.text_embedding_chunk_size = text_embedding_chunk_size
|
39 |
+
self.k_nearest_neighbours = k_nearest_neighbours
|
40 |
+
|
41 |
+
@beartype
|
42 |
+
def link_claims_to_evidence(
|
43 |
+
self,
|
44 |
+
metas,
|
45 |
+
faiss_db,
|
46 |
+
):
|
47 |
+
embedding_function = OpenAIEmbeddings()
|
48 |
+
|
49 |
+
# build a query from the claim and source fragment
|
50 |
+
queries = [
|
51 |
+
f"Evidence for {x['claim']} (Based on {x['verbatim_quote']})" for x in metas
|
52 |
+
]
|
53 |
+
encoding = tiktoken.encoding_for_model(self.embedding_model)
|
54 |
+
|
55 |
+
num_tokens = len(encoding.encode(" ".join(queries)))
|
56 |
+
print(
|
57 |
+
f"Step6: Estimated cost: {num_tokens * PRICE_PER_1K_TOKENS[self.embedding_model]['embed'] / 1000:.2f} USD"
|
58 |
+
)
|
59 |
+
k_nearest_neighbours = min(
|
60 |
+
len(faiss_db.index_to_docstore_id), self.k_nearest_neighbours
|
61 |
+
)
|
62 |
+
|
63 |
+
for text_query, meta in zip(queries, metas):
|
64 |
+
docs_and_scores = faiss_db.similarity_search_with_relevance_scores(
|
65 |
+
text_query, k=k_nearest_neighbours
|
66 |
+
)
|
67 |
+
|
68 |
+
# allow evidence to be serialised
|
69 |
+
evidences = []
|
70 |
+
for document, score in docs_and_scores:
|
71 |
+
evidence = {
|
72 |
+
"chunk_tag": document.metadata["chunk_tag"],
|
73 |
+
"link": document.metadata["link"],
|
74 |
+
"query": document.metadata["query"],
|
75 |
+
"date_accessed": document.metadata["date_accessed"],
|
76 |
+
"text": document.page_content,
|
77 |
+
"similarity_score": float(score),
|
78 |
+
}
|
79 |
+
evidences.append(evidence)
|
80 |
+
|
81 |
+
meta["evidences"] = evidences
|
82 |
+
meta["embedded_query_used_to_find_evidence"] = text_query
|
83 |
+
|
84 |
+
print(f"Returning {len(metas)} queries with supporting evidence")
|
85 |
+
return metas
|
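ClaimToEvidence.link_claims_to_evidence builds a query from each claim and its verbatim quote, runs a k-nearest-neighbour similarity search against the FAISS index from step 5, and attaches the retrieved chunks (text, link, access date, similarity score) as evidences. A minimal sketch, assuming OPENAI_API_KEY is set and faiss_db is the index built in the step 5 sketch above:

```python
from step6_api_claims_to_evidence import ClaimToEvidence

claims = [{
    "claim": "Hannah Arendt was born in 1906.",
    "verbatim_quote": "born in 1906",
}]

linker = ClaimToEvidence(k_nearest_neighbours=3)
claims_with_evidence = linker.link_claims_to_evidence(claims, faiss_db=faiss_db)
print(claims_with_evidence[0]["evidences"][0]["link"])
```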
step7_api_check_claims_against_evidence.py
ADDED
@@ -0,0 +1,153 @@
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
import multiprocessing as mp
|
4 |
+
from zsvision.zs_multiproc import starmap_with_kwargs
|
5 |
+
from pathlib import Path
|
6 |
+
from zsvision.zs_utils import BlockTimer
|
7 |
+
from typing import List, Dict
|
8 |
+
from llm_api_utils import (
|
9 |
+
call_openai_with_exponetial_backoff,
|
10 |
+
estimate_cost_of_text_generation_api_call,
|
11 |
+
init_openai_with_api_key,
|
12 |
+
)
|
13 |
+
|
14 |
+
|
15 |
+
class CheckClaimAgainstEvidence:
|
16 |
+
def __init__(
|
17 |
+
self,
|
18 |
+
temperature=0.0,
|
19 |
+
max_num_evidences=2,
|
20 |
+
model="gpt-3.5-turbo",
|
21 |
+
src_dir=Path("data/raw"),
|
22 |
+
dest_dir=Path("data/extracted_claims"),
|
23 |
+
filter_str="",
|
24 |
+
processes=1,
|
25 |
+
refresh=False,
|
26 |
+
):
|
27 |
+
self.temperature = temperature
|
28 |
+
self.max_num_evidences = max_num_evidences
|
29 |
+
self.model = model
|
30 |
+
self.src_dir = src_dir
|
31 |
+
self.dest_dir = dest_dir
|
32 |
+
self.filter_str = filter_str
|
33 |
+
self.processes = processes
|
34 |
+
self.refresh = refresh
|
35 |
+
|
36 |
+
def check_claim_against_evidence(
|
37 |
+
self,
|
38 |
+
claim: str,
|
39 |
+
evidences: List[Dict[str, str]],
|
40 |
+
):
|
41 |
+
init_openai_with_api_key()
|
42 |
+
evidence_str = ""
|
43 |
+
for evidence in evidences:
|
44 |
+
# avoid linebreaks in each piece of evidence, else it can create a confusing prompt
|
45 |
+
text_evidence = evidence["text"].replace("\n", " ")
|
46 |
+
evidence_str += f"{text_evidence}\n"
|
47 |
+
evidence_str += f"URL: {evidence['link']}'\n"
|
48 |
+
evidence_str += f"Date accessed: {evidence['date_accessed']}\n\n"
|
49 |
+
|
50 |
+
prompt = f"""\
|
51 |
+
Your task is to assess whether a claim is correct based on the given pieces of evidence.
|
52 |
+
|
53 |
+
Your answer should be in json format as follows:
|
54 |
+
{{
|
55 |
+
"verdict": "<verdict>",
|
56 |
+
"justification": "<justification for the verdict>",
|
57 |
+
"quotes": ["<most relevant verbatim quotes from evidence>"],
|
58 |
+
"URLs": "<URL sources for verbatim quotes>",
|
59 |
+
"date_accessed": "<access dates for URL quotes>"
|
60 |
+
}}
|
61 |
+
The <verdict> label should be one of the following:
|
62 |
+
"Fully supported", "Partially supported", "Unsupported"
|
63 |
+
|
64 |
+
When quoting the relevant sentence from the evidence, be careful to copy it **EXACTLY** (with no edits).
|
65 |
+
---
|
66 |
+
## Example
|
67 |
+
|
68 |
+
**Claim**:
|
69 |
+
Hannah Arendt was born in 1906.
|
70 |
+
|
71 |
+
**Pieces of evidence**:
|
72 |
+
Hannah Arendt was a 20th-century German-Jewish political thinker and philosopher. She was born in Linden, Hanover, Germany in 1906. When she was three her family moved to Königsberg so that her father’s syphilis could be treated. He died when she was seven years old. Königsberg was where Immanuel Kant was born, right?
|
73 |
+
|
74 |
+
Königsberg was where Immanuel Kant was born, right?
|
75 |
+
URL: https://fivebooks.com/best-books/hannah-arendt-samantha-rose-hill/'
|
76 |
+
Date accessed: 2023-05-10
|
77 |
+
|
78 |
+
Hannah Arendt was born as Johanna Arendt in 1906, in the Wilhelmine period. Her German Jewish family were comfortable, educated and secular in Linden, Prussia (now a part of Hanover). They were merchants of Russian extraction from Königsberg.[a] Her grandparents were members of the Reform Jewish community. Her paternal grandfather, Max Arendt [de] (1843–1913), was a prominent businessman, local politician, a leader of the Königsberg Jewish community and a member of the Centralverein deutscher
|
79 |
+
URL: https://en.wikipedia.org/wiki/Hannah_Arendt'
|
80 |
+
Date accessed: 2023-05-10
|
81 |
+
|
82 |
+
|
83 |
+
**Assessment**:
|
84 |
+
{{
|
85 |
+
"verdict": "Fully supported",
|
86 |
+
"justification": "The claim about Hannah Arendt's birth date is directly supported by the evidence.",
|
87 |
+
"quotes": ["Hannah Arendt was born as Johanna Arendt in 1906, in the Wilhelmine period."],
|
88 |
+
"URLs": "https://en.wikipedia.org/wiki/Hannah_Arendt",
|
89 |
+
"date_accessed": "2023-05-10"
|
90 |
+
}}
|
91 |
+
---
|
92 |
+
**Claim**:
|
93 |
+
{claim}
|
94 |
+
|
95 |
+
**Pieces of evidence**:
|
96 |
+
{evidence_str}
|
97 |
+
**Assessment**:\
|
98 |
+
"""
|
99 |
+
persona = "You are a careful research assistant who helps with fact-checking and editing informative articles."
|
100 |
+
system_message = {"role": "system", "content": persona}
|
101 |
+
user_message = {"role": "user", "content": prompt}
|
102 |
+
messages = [system_message, user_message]
|
103 |
+
|
104 |
+
with BlockTimer(
|
105 |
+
f"Using OpenAI API to check claims against evidence {self.model}"
|
106 |
+
):
|
107 |
+
response = call_openai_with_exponetial_backoff(
|
108 |
+
model=self.model,
|
109 |
+
temperature=self.temperature,
|
110 |
+
messages=messages,
|
111 |
+
response_format={"type": "json_object"},
|
112 |
+
)
|
113 |
+
|
114 |
+
cost = estimate_cost_of_text_generation_api_call(
|
115 |
+
model=self.model, response=response, verbose=True
|
116 |
+
)
|
117 |
+
|
118 |
+
assessment = response.choices[0].message.content
|
119 |
+
assessment_dict = json.loads(assessment)
|
120 |
+
return {"assessment": assessment_dict, "cost": cost}
|
121 |
+
|
122 |
+
def check_claims_against_evidence(self, claims_with_evidence):
|
123 |
+
"""
|
124 |
+
Checks claims against evidence.
|
125 |
+
"""
|
126 |
+
kwarg_list = []
|
127 |
+
results = []
|
128 |
+
for idx, item in enumerate(claims_with_evidence):
|
129 |
+
kwarg_list.append(
|
130 |
+
{
|
131 |
+
"claim": item["claim"],
|
132 |
+
"evidences": item["evidences"][: self.max_num_evidences],
|
133 |
+
}
|
134 |
+
)
|
135 |
+
if self.processes == 1:
|
136 |
+
for kwargs in kwarg_list:
|
137 |
+
results.append(self.check_claim_against_evidence(**kwargs))
|
138 |
+
else: # multiprocess
|
139 |
+
func = self.check_claim_against_evidence
|
140 |
+
with mp.Pool(processes=self.processes) as pool:
|
141 |
+
results = starmap_with_kwargs(
|
142 |
+
pool=pool, func=func, kwargs_iter=kwarg_list
|
143 |
+
)
|
144 |
+
costs = [result["cost"] for result in results]
|
145 |
+
print(f"Total cost: {sum(costs)} USD")
|
146 |
+
assessed_claims = []
|
147 |
+
for result, item in zip(results, claims_with_evidence):
|
148 |
+
item["assessment"] = result["assessment"]
|
149 |
+
item["verdict_model"] = self.model
|
150 |
+
assessed_claims.append(item)
|
151 |
+
|
152 |
+
print(f"Writing {len(assessed_claims)} assessed claims")
|
153 |
+
return assessed_claims
|
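CheckClaimAgainstEvidence sends each claim together with its top max_num_evidences evidence chunks to the model in JSON mode and stores the parsed verdict ("Fully supported", "Partially supported" or "Unsupported") under assessment. A minimal sketch, continuing from the step 6 output; it assumes OPENAI_API_KEY is set and that the chosen model supports response_format={"type": "json_object"}:

```python
from step7_api_check_claims_against_evidence import CheckClaimAgainstEvidence

checker = CheckClaimAgainstEvidence(max_num_evidences=2)
assessed = checker.check_claims_against_evidence(claims_with_evidence)
print(assessed[0]["assessment"]["verdict"])
print(assessed[0]["assessment"]["justification"])
```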
step8_api_format_fact_checked_document.py
ADDED
@@ -0,0 +1,121 @@
1 |
+
import argparse
|
2 |
+
import re
|
3 |
+
from collections import defaultdict
|
4 |
+
import json
|
5 |
+
from text_utils import find_matching_indices
|
6 |
+
from pathlib import Path
|
7 |
+
|
8 |
+
|
9 |
+
class FormatDocument:
|
10 |
+
def __init__(
|
11 |
+
self,
|
12 |
+
footnote_style: str,
|
13 |
+
temperature=0.0,
|
14 |
+
model="gpt-4",
|
15 |
+
dest_dir=Path("data/extracted_claims"),
|
16 |
+
filter_str="",
|
17 |
+
refresh=False,
|
18 |
+
):
|
19 |
+
self.temperature = temperature
|
20 |
+
self.model = model
|
21 |
+
self.dest_dir = dest_dir
|
22 |
+
self.filter_str = filter_str
|
23 |
+
self.refresh = refresh
|
24 |
+
self.footnote_style = footnote_style
|
25 |
+
|
26 |
+
def cleanup_explanation(self, claim_assessment: dict, mode: str) -> str:
|
27 |
+
claim = claim_assessment["claim"]
|
28 |
+
assessment = claim_assessment["assessment"]
|
29 |
+
justification = assessment["justification"]
|
30 |
+
category = assessment["verdict"]
|
31 |
+
urls = assessment["URLs"]
|
32 |
+
date_accessed = assessment["date_accessed"]
|
33 |
+
|
34 |
+
prefixes = {
|
35 |
+
"Fully supported": "✅",
|
36 |
+
"Partially supported": "❓",
|
37 |
+
"Unsupported": "❗",
|
38 |
+
}
|
39 |
+
prefix = prefixes[category]
|
40 |
+
quotes = ",".join(f'"{quote}"' for quote in assessment["quotes"])
|
41 |
+
# Sometimes, the verdict justification contains newlines , which messes up the formatting of footnotes.
|
42 |
+
justification = justification.replace("\n", "")
|
43 |
+
|
44 |
+
if mode == "terse":
|
45 |
+
footnote = f"Claim: {claim} 👉 {category} {urls}"
|
46 |
+
elif mode == "verbose":
|
47 |
+
footnote = f"Claim: {claim} 👉 {category} {quotes} {justification}, URLs: {urls}, date accessed: {date_accessed}"
|
48 |
+
footnote = f"{prefix} {footnote}"
|
49 |
+
return footnote
|
50 |
+
|
51 |
+
def reformat_document_to_include_claims(
|
52 |
+
self,
|
53 |
+
original_text,
|
54 |
+
fact_verdicts,
|
55 |
+
footnote_style=None,
|
56 |
+
):
|
57 |
+
bibliography = []
|
58 |
+
footnote_markers_to_insert = []
|
59 |
+
statistics = defaultdict(int)
|
60 |
+
number_of_facts_checked = 0
|
61 |
+
if footnote_style:
|
62 |
+
self.footnote_style = footnote_style
|
63 |
+
for fact_idx, claim_assessment in enumerate(fact_verdicts):
|
64 |
+
if self.footnote_style == "terse":
|
65 |
+
footnote_str = f"{fact_idx + 1}"
|
66 |
+
elif self.footnote_style == "verbose":
|
67 |
+
footnote_str = claim_assessment["claim"].replace(" ", "-")
|
68 |
+
# footnote markers cannot contain much punctuation or commas in Jekyll
|
69 |
+
# (even though this is valid in GitHub-flavoured markdown)
|
70 |
+
for char in [
|
71 |
+
",",
|
72 |
+
".",
|
73 |
+
'"',
|
74 |
+
"'",
|
75 |
+
":",
|
76 |
+
";",
|
77 |
+
"(",
|
78 |
+
")",
|
79 |
+
"[",
|
80 |
+
"]",
|
81 |
+
"{",
|
82 |
+
"}",
|
83 |
+
"*",
|
84 |
+
]:
|
85 |
+
footnote_str = footnote_str.replace(char, "")
|
86 |
+
|
87 |
+
explanation = self.cleanup_explanation(
|
88 |
+
claim_assessment, mode=self.footnote_style
|
89 |
+
)
|
90 |
+
footnote_marker = f"[^{footnote_str}]"
|
91 |
+
query = claim_assessment["verbatim_quote"]
|
92 |
+
|
93 |
+
assert (
|
94 |
+
original_text.count(query) == 1
|
95 |
+
), f"Found {original_text.count(query)} matches for {query}, rather than 1"
|
96 |
+
start_pos = original_text.find(query)
|
97 |
+
assert start_pos != -1, f"Could not find {query} in {original_text}"
|
98 |
+
end_pos = start_pos + len(query)
|
99 |
+
footnote_markers_to_insert.append((end_pos, footnote_marker))
|
100 |
+
verdict_category = claim_assessment["assessment"]["verdict"]
|
101 |
+
statistics[verdict_category] += 1
|
102 |
+
number_of_facts_checked += 1
|
103 |
+
bibliography.append(f"{footnote_marker}: {explanation} ")
|
104 |
+
|
105 |
+
# perform insertions in reverse order so that the indices don't get messed up
|
106 |
+
modified_text = original_text
|
107 |
+
for char_pos, footnote_marker in sorted(
|
108 |
+
footnote_markers_to_insert, reverse=True
|
109 |
+
):
|
110 |
+
modified_text = (
|
111 |
+
modified_text[:char_pos] + footnote_marker + modified_text[char_pos:]
|
112 |
+
)
|
113 |
+
|
114 |
+
modified_text += "\n\n"
|
115 |
+
modified_text += "\n".join(bibliography)
|
116 |
+
|
117 |
+
# assert number_of_facts_checked != 0, "No facts were checked"
|
118 |
+
if number_of_facts_checked == 0:
|
119 |
+
print("No objective facts were found.")
|
120 |
+
modified_text = "No clear-cut objective claims were detected."
|
121 |
+
return modified_text
|
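FormatDocument.reformat_document_to_include_claims locates each claim's verbatim quote in the original text (which must contain it exactly once), appends a footnote marker after it, and emits a footnote bibliography prefixed with ✅ / ❓ / ❗ according to the verdict. A minimal sketch, continuing from the step 7 output and assuming the model returned the keys named in the step 7 schema:

```python
from step8_api_format_fact_checked_document import FormatDocument

# The original text must contain each claim's verbatim_quote exactly once.
original_text = "Hannah Arendt, born in 1906, was a political philosopher."

formatter = FormatDocument(footnote_style="terse")
fact_checked = formatter.reformat_document_to_include_claims(original_text, assessed)
print(fact_checked)
```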
subjective_claims.txt
ADDED
@@ -0,0 +1 @@
+ Arendt's experiences during this time influenced her work on totalitarianism and human rights.
text_utils.py
ADDED
@@ -0,0 +1,188 @@
1 |
+
import re
|
2 |
+
from typing import Dict
|
3 |
+
import unittest
|
4 |
+
|
5 |
+
|
6 |
+
def parse_passage_quote_and_claim(passage_quote_and_claim: str) -> Dict[str, str]:
    """Parse the quote and claim from a string, where the string is of the form:

    [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] <passage quote for claim> [CLAIM] <claim>
    """

    if not passage_quote_and_claim.startswith("[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM]"):
        raise ValueError(f"Invalid input format: {passage_quote_and_claim}")

    parts = passage_quote_and_claim.split("[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM]")
    source_parts = parts[1].split("[CLAIM]")

    # If there aren't exactly two parts after splitting by [CLAIM], the format is invalid
    if len(source_parts) != 2:
        raise ValueError(f"Invalid input format: {passage_quote_and_claim}")

    passage_quote_for_claim = source_parts[0].strip()
    claim = source_parts[1].strip()
    return {"verbatim_quote": passage_quote_for_claim, "claim": claim}


def is_unique_verbatim_quote(verbatim_quote: str, original_passage: str):
    """Check if the verbatim quote is an exact quote from the original passage."""
    return original_passage.count(verbatim_quote) == 1


def find_matching_indices(query: str, original_text: str):
    # Function to remove markdown links and create an index map
    def remove_links(text):
        index_map = []
        result = []
        markdown_links = re.finditer(r"\[([^\]]+)\]\([^)]+\)", text)

        prev_end = 0
        for match in markdown_links:
            result.append(text[prev_end : match.start()])
            index_map.extend(range(prev_end, match.start()))
            result.append(match.group(1))
            index_map.extend(range(match.start(1), match.end(1)))
            prev_end = match.end()

        result.append(text[prev_end:])
        index_map.extend(range(prev_end, len(text)))

        return "".join(result), index_map

    # Remove markdown links from the original text and create an index map
    cleaned_text, index_map = remove_links(original_text)

    # Remove markdown links from the query
    cleaned_query, _ = remove_links(query)

    # Find the start index of the cleaned query in the cleaned text
    start = cleaned_text.find(cleaned_query)

    # If the query is not found, return an empty list
    if start == -1:
        return []

    # Add the query length to get the end index
    end = start + len(cleaned_query)

    # Use the index map to find the corresponding start and end indices in the original text
    original_start = index_map[start]
    original_end = index_map[end - 1] + 1

    return [(original_start, original_end)]


class TestCases(unittest.TestCase):
    def test_find_matching_indices(self):
        """Test the find_matching_indices() function.
        This function should return a list of matches, where each match is a tuple of (start, end) indices.

        The start and end indices should be the character positions of the query in the original_text, accounting
        for the fact that markdown links should be ignored when performing the match.

        """
        test_cases = [
            {
                "query": "Her writings on these subjects continue to resonate with present-day political struggles, as well as with the works of other philosophers like Immanuel Kant and Edmund Husserl.",
                "original": "Arendt's later works, sought to further unravel the complexities of power and rebellion. Her writings on these subjects continue to resonate with present-day political struggles, as well as with the works of other philosophers like [Immanuel Kant](/philosophy/2023-immanuel-kant-life-and-work) and [Edmund Husserl](/philosophy/2023-edmund-husserl-his-life-and-work).\n\n## A Lasting Legacy",
                "expected": "Her writings on these subjects continue to resonate with present-day political struggles, as well as with the works of other philosophers like [Immanuel Kant](/philosophy/2023-immanuel-kant-life-and-work) and [Edmund Husserl](/philosophy/2023-edmund-husserl-his-life-and-work).",
            },
            {
                "query": "I went to the sea side (at the weekend).",
                "original": "I woke up. Then I went to the sea side (at the weekend). Then I went home.",
                "expected": "I went to the sea side (at the weekend).",
            },
            {
                "query": "no merger with the [solar farm] company",
                "original": "There would be no merger with the [solar farm] company.",
                "expected": "no merger with the [solar farm] company",
            },
            {
                "query": "with the works of other philosophers like [Immanuel Kant](/philosophy/2023-immanuel-kant-life-and-work) and [Edmund Husserl](/philosophy/2023-edmund-husserl-his-life-and-work)",
                "original": "\n\n## Fleeing Germany and the Road to Academia\n\nWith the rise of the Nazi regime in the 1930s, Arendt's Jewish heritage put her in grave danger. She fled Germany in 1933 and settled in Paris, where she became involved with a number of political and social organizations advocating for Jewish refugees. In 1940, she was interned in a French camp as an enemy alien, but managed to escape and eventually make her way to the United States in 1941.\n\nArendt's experiences during this time would deeply influence her work on totalitarianism and human rights. In New York, she began to immerse herself in academic life, working as an editor, journalist, and lecturer. Her first major work, *The Origins of Totalitarianism*, published in 1951, explored the common roots of Nazism and Stalinism, and established her as a significant voice in political philosophy.\n\n## A Life Of Controversial, Influential Works\n\nThroughout her career, Arendt wrote a number of seminal, and controversial, works. *The Human Condition* (1958) examined the role of politics in modern societies and introduced the concept of \"the public realm\" – the space where individuals act and participate in political life. This exploration of freedom and action would become a recurring theme in her writings.\n\nHer 1963 publication, *Eichmann in Jerusalem: A Report on the Banality of Evil*, based on her coverage of Adolf Eichmann's trial, ignited significant controversy. Arendt argued that Eichmann, a key architect of the Holocaust, was not a monster but rather an ordinary bureaucrat who unquestioningly followed orders. The idea of the \"banality of evil\" continues to influence discussions on the nature of evil and moral responsibility.\n\nArendt's later works, such as *On Revolution* (1963) and *Between Past and Future* (1968), sought to further unravel the complexities of power, authority, and rebellion. Her writings on these subjects continue to resonate with present-day political struggles, as well as with the works of other philosophers like [Immanuel Kant](/philosophy/2023-immanuel-kant-life-and-work) and [Edmund Husserl](/philosophy/2023-edmund-husserl-his-life-and-work).\n\n## A Lasting Legacy\n\nHannah Arendt died in 1975, but her work remains as relevant as ever.",
                "expected": "with the works of other philosophers like [Immanuel Kant](/philosophy/2023-immanuel-kant-life-and-work) and [Edmund Husserl](/philosophy/2023-edmund-husserl-his-life-and-work)",
            },
        ]

        for test_case in test_cases:
            matches = find_matching_indices(
                query=test_case["query"], original_text=test_case["original"]
            )
            assert (
                len(matches) == 1
            ), f"Expected exactly one match, but found {len(matches)}"
            result = test_case["original"][matches[0][0] : matches[0][1]]
            msg = (
                f"Expected\n\n{test_case['expected']}\n\nbut instead found\n\n{result}"
            )
            self.assertEqual(result, test_case["expected"], msg)
        print("Passed all tests for find_matching_indices()")

    def test_parse_passage_quote_and_claim(self):
        """Test the following function:
        parse_passage_quote_and_claim(passage_quote_and_claim: str) -> {"verbatim_quote": str, "claim": str}

        The passage quote and claim should take the form:
        [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] <passage quote for claim> [CLAIM] <claim>
        """
        test_cases = [
            {
                "passage_quote_and_claim": "[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Hannah Arendt [was born in] 1906 in Linden, Germany [CLAIM] Hannah Arendt was born in Linden, Germany.",
                "expected": {
                    "verbatim_quote": "Hannah Arendt [was born in] 1906 in Linden, Germany",
                    "claim": "Hannah Arendt was born in Linden, Germany.",
                },
            },
            {
                "passage_quote_and_claim": "Something [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Hannah Arendt [was born in] 1906 in Linden, Germany [CLAIM] Hannah Arendt was born in Linden, Germany.",
                "expected": "Exception",
            },
        ]
        for test_case in test_cases:
            expected = test_case["expected"]
            if expected == "Exception":
                self.assertRaises(
                    ValueError,
                    parse_passage_quote_and_claim,
                    test_case["passage_quote_and_claim"],
                )
            else:
                parsed = parse_passage_quote_and_claim(
                    passage_quote_and_claim=test_case["passage_quote_and_claim"]
                )
                self.assertEqual(parsed["verbatim_quote"], expected["verbatim_quote"])

    def test_is_unique_verbatim_quote_check(self):
        """Test the following function:
        is_unique_verbatim_quote(verbatim_quote: str, original_passage: str) -> bool

        This function should return True if the verbatim quote appears exactly once in the original passage, and False otherwise.

        """
        test_cases = [
            {
                "verbatim_quote": "Hannah Arendt [was born in] 1906 in Linden, Germany",
                "original_passage": "Hannah Arendt [was born in] 1906 in Linden, Germany at a time when...",
                "expected": True,
            },
            {
                "verbatim_quote": "Hannah Arendt [was born in] 1906 in Linden, Germany",
                "original_passage": "Hannah Arendt [wasn't born in] 1906 in Linden, Germany at a time when...",
                "expected": False,
            },
            {
                "verbatim_quote": "Hannah Arendt [was born in] 1906 in Linden, Germany. Hannah Arendt was a person.",
                "original_passage": "Hannah Arendt",
                "expected": False,
            },
        ]
        for test_case in test_cases:
            result = is_unique_verbatim_quote(
                verbatim_quote=test_case["verbatim_quote"],
                original_passage=test_case["original_passage"],
            )
            self.assertEqual(result, test_case["expected"])


if __name__ == "__main__":
    unittest.main()
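A minimal usage sketch for the helpers above (illustrative only; it assumes text_utils.py is importable from the repo root, and the example strings are made up):

```python
from text_utils import (
    find_matching_indices,
    is_unique_verbatim_quote,
    parse_passage_quote_and_claim,
)

# Split an LLM-formatted "[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] ... [CLAIM] ..." string.
tagged = (
    "[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Arendt was born in 1906 in Linden, Germany "
    "[CLAIM] Hannah Arendt was born in Linden, Germany."
)
parsed = parse_passage_quote_and_claim(tagged)
print(parsed["verbatim_quote"])  # Arendt was born in 1906 in Linden, Germany
print(parsed["claim"])           # Hannah Arendt was born in Linden, Germany.

# Check that the quote occurs exactly once in the source passage.
passage = "Arendt was born in 1906 in Linden, Germany, and she died in 1975."
print(is_unique_verbatim_quote(parsed["verbatim_quote"], passage))  # True

# Locate the quote in the original passage; markdown links are ignored when matching.
print(find_matching_indices(query=parsed["verbatim_quote"], original_text=passage))  # [(0, 42)]
```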
utils.py
ADDED
@@ -0,0 +1,18 @@
import os
from functools import lru_cache
from langchain_community.utilities import GoogleSearchAPIWrapper


@lru_cache(maxsize=2)
def get_search_wrapper():
    # GoogleSearchAPIWrapper reads GOOGLE_API_KEY and GOOGLE_CSE_ID, so map the
    # project's environment variables onto the names it expects.
    os.environ["GOOGLE_API_KEY"] = os.environ.get("GOOGLE_CLOUD_API_KEY")
    os.environ["GOOGLE_CSE_ID"] = os.environ.get("GOOGLE_CUSTOM_SEARCH_ENGINE_ID")
    return GoogleSearchAPIWrapper()


def get_google_search_results(query_str: str, num_results: int):
    # The wrapper is cached, so repeated calls reuse a single search client.
    google_search_tool = get_search_wrapper()
    search_results = google_search_tool.results(
        query=query_str, num_results=num_results
    )
    return search_results
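A minimal usage sketch for this helper (it assumes GOOGLE_CLOUD_API_KEY and GOOGLE_CUSTOM_SEARCH_ENGINE_ID are exported in the environment, e.g. loaded from the .env described in the README; the query string is illustrative):

```python
from utils import get_google_search_results

# Each item returned by GoogleSearchAPIWrapper.results() is a dict that
# typically carries "title", "link" and "snippet" fields.
results = get_google_search_results("Hannah Arendt birthplace", num_results=3)
for result in results:
    print(result.get("title"), result.get("link"))
```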