vladbogo committed on
Commit
7a8b33f
1 Parent(s): 49e2e17

Upload folder using huggingface_hub

.gitignore ADDED
@@ -0,0 +1,4 @@
+ .env
+ .venv
+ data
+ __pycache__/
LICENCE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 vladbogo, albanie, ioanacroi, abuonomo
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,31 @@
  ---
  title: Filtir
- emoji: 📚
- colorFrom: pink
- colorTo: yellow
+ app_file: app.py
  sdk: gradio
  sdk_version: 4.19.0
- app_file: app.py
- pinned: false
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Filtir - fact-checking pipeline
+
+ This repo contains the Filtir pipeline for claim extraction and fact-checking.
+
+ ## Prerequisites
+
+ ### Create and prepare venv
+ ```bash
+ python3 -m venv .venv
+ source .venv/bin/activate
+ pip install -r requirements.txt
+ ```
+
+ ### Setup keys
+ In order to run the code, you need to set up the following keys and add them to .env (an illustrative template is shown below):
+
+ - OPENAI_API_KEY - used to call the OpenAI API
+ - COHERE_API_KEY and WEAVIATE_API_KEY - used for Wikipedia search
+ - GOOGLE_CLOUD_API_KEY and GOOGLE_CUSTOM_SEARCH_ENGINE_ID - used for Google search
+
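+ For illustration, a minimal `.env` would look something like the following (variable names as listed above; the values are placeholders, not real keys):
+
+ ```bash
+ OPENAI_API_KEY=sk-...
+ COHERE_API_KEY=your-cohere-key
+ WEAVIATE_API_KEY=your-weaviate-key
+ GOOGLE_CLOUD_API_KEY=your-google-cloud-key
+ GOOGLE_CUSTOM_SEARCH_ENGINE_ID=your-search-engine-id
+ ```
+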
+ ## Run the pipeline
+
+ ```bash
+ python run_pipeline.py --file example.txt --model gpt-4-1106-preview
+ ```
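+
+ The pipeline can also be called from Python via `get_fact_checked` (defined in `run_pipeline.py`). A minimal sketch, assuming the keys above are already set in `.env`:
+
+ ```python
+ from run_pipeline import get_fact_checked
+
+ text = "Arendt fled Germany in 1933."
+ out = get_fact_checked(text, mode="slow", model="gpt-4-1106-preview")
+ print(out["fact_checked_md"])  # the fact-checked text as Markdown
+ ```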
app.py ADDED
@@ -0,0 +1,30 @@
+ # gradio_app.py
+
+ import gradio as gr
+ from run_pipeline import get_fact_checked
+
+
+ def fact_check_function(text, model):
+     # Assume the text is already read from the user input, so we don't need to open a file here
+     out = get_fact_checked(text, mode="slow", model=model)
+     return out["fact_checked_md"]
+
+
+ def create_gradio_interface():
+     iface = gr.Interface(
+         allow_flagging=False,
+         fn=fact_check_function,
+         inputs=[
+             gr.Textbox(
+                 lines=10, placeholder="Enter text to fact-check...", label="Input Text"
+             ),
+             gr.Dropdown(choices=["gpt-4-1106-preview"], label="Model"),
+         ],
+         outputs=gr.Markdown(label="Filtir Output"),
+     )
+     return iface
+
+
+ if __name__ == "__main__":
+     iface = create_gradio_interface()
+     iface.launch()
example.txt ADDED
@@ -0,0 +1 @@
+ Arendt fled Germany in 1933.
fetch_evidence.py ADDED
@@ -0,0 +1,184 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import json
4
+ import json5
5
+ import argparse
6
+ from pathlib import Path
7
+ import multiprocessing as mp
8
+ from zsvision.zs_multiproc import starmap_with_kwargs
9
+ from pipeline_paths import PIPELINE_PATHS
10
+ from datetime import datetime
11
+ import urllib.robotparser
12
+ import urllib.parse
13
+ from utils import get_google_search_results
14
+
15
+ import time
16
+ from random import randint
17
+ from fake_useragent import UserAgent
18
+ from newspaper import Article, Config
19
+
20
+
21
+ def can_scrape(url, user_agent="*"):
22
+ rp = urllib.robotparser.RobotFileParser()
23
+ rp.set_url(f"{url.scheme}://{url.netloc}/robots.txt")
24
+ # be conservative - if we can't find robots.txt, don't scrape
25
+ try:
26
+ rp.read()
27
+ ok_to_scrape = rp.can_fetch(user_agent, url.geturl())
28
+ except urllib.error.URLError:
29
+ ok_to_scrape = False
30
+ return ok_to_scrape
31
+
32
+
33
+ def fetch_search_results_to_gather_evidence(
34
+ args,
35
+ idx: int,
36
+ total: int,
37
+ search_results_dest_path: Path,
38
+ queryset: dict,
39
+ ):
40
+ user_agent = UserAgent()
41
+ config = Config()
42
+ config.fetch_images = False
43
+ print(f"Query {idx}/{total}")
44
+
45
+ search_results_dest_path.parent.mkdir(exist_ok=True, parents=True)
46
+
47
+ # check if we already have search_results for this title
48
+ if search_results_dest_path.exists() and not args.refresh:
49
+ print(f"Found existing search results at {search_results_dest_path}, skipping")
50
+ return 0
51
+
52
+ headers = {
53
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
54
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
55
+ "Accept-Language": "en-US,en;q=0.5",
56
+ "DNT": "1",
57
+ "Connection": "keep-alive",
58
+ "Upgrade-Insecure-Requests": "1",
59
+ }
60
+
61
+ # we assume some sites won't permit scraping, so we'll skip these
62
+ num_results = args.num_search_results_to_keep + 5
63
+ results = {}
64
+
65
+ for item in queryset:
66
+ if item["search_query"] == "no suitable query":
67
+ item["search_results"] = []
68
+ continue
69
+
70
+ search_results = get_google_search_results(
71
+ query_str=item["search_query"], num_results=num_results
72
+ )
73
+
74
+ if search_results == [{"Result": "No good Google Search Result was found"}]:
75
+ item["search_results"] = []
76
+ continue
77
+
78
+ parsed_results = []
79
+ for search_result in search_results:
80
+ if not can_scrape(
81
+ urllib.parse.urlparse(search_result["link"]), user_agent="MyScraper"
82
+ ):
83
+ print(
84
+ f"Skipping {search_result['link']} because it doesn't permit scraping"
85
+ )
86
+ continue
87
+ try:
88
+ config.browser_user_agent = user_agent.random
89
+ article = Article(search_result["link"], language="en", config=config)
90
+ article.download()
91
+ article.parse()
92
+ text = article.text
93
+ except Exception as e:
94
+ print(f"Error parsing article: {e}, trying with requests.get...")
95
+ try:
96
+ response = requests.get(
97
+ search_result["link"], timeout=15, headers=headers
98
+ )
99
+ html = response.text
100
+ soup = BeautifulSoup(html, features="html.parser")
101
+ text = soup.get_text()
102
+ except Exception as exception:
103
+ print(f"Error parsing article: {exception}")
104
+ raise exception
105
+
106
+ search_result["text"] = text
107
+ parsed_results.append(search_result)
108
+ if len(parsed_results) == args.num_search_results_to_keep:
109
+ break
110
+ item["search_results"] = parsed_results
111
+
112
+ # update the queryset with new information
113
+ date_str = datetime.now().strftime("%Y-%m-%d")
114
+ results = {"documents": queryset, "dates": {"search_results_fetched": date_str}}
115
+
116
+ print(
117
+ f"Writing web pages for search results for {len(queryset)} queries to {search_results_dest_path}"
118
+ )
119
+ with open(search_results_dest_path, "w") as f:
120
+ f.write(json.dumps(results, indent=4, sort_keys=True))
121
+
122
+
123
+ def main():
124
+ args = parse_args()
125
+ search_query_paths = list(
126
+ PIPELINE_PATHS["search_queries_for_evidence"].glob("**/*.json")
127
+ )
128
+
129
+ if args.limit:
130
+ print(f"Limited to {args.limit} search querysets")
131
+ search_query_paths = search_query_paths[: args.limit]
132
+
133
+ kwarg_list = []
134
+ for idx, search_query_path in enumerate(search_query_paths):
135
+ rel_path = search_query_path.relative_to(
136
+ PIPELINE_PATHS["search_queries_for_evidence"]
137
+ )
138
+ dest_path = PIPELINE_PATHS["google_search_results_evidence"] / rel_path
139
+
140
+ if dest_path.exists() and not args.refresh:
141
+ print(f"For {search_query_path}, found results at {dest_path}, skipping")
142
+ continue
143
+
144
+ with open(search_query_path, "r") as f:
145
+ queryset = json.load(f)
146
+ kwarg_list.append(
147
+ {
148
+ "idx": idx,
149
+ "total": len(search_query_paths),
150
+ "search_results_dest_path": dest_path,
151
+ "args": args,
152
+ "queryset": queryset,
153
+ }
154
+ )
155
+
156
+ # provide the total number of queries to each process
157
+ for kwargs in kwarg_list:
158
+ kwargs["total"] = len(kwarg_list)
159
+
160
+ # single process
161
+ if args.processes == 1:
162
+ cost = 0
163
+ for kwargs in kwarg_list:
164
+ fetch_search_results_to_gather_evidence(**kwargs)
165
+ else: # multiprocess
166
+ func = fetch_search_results_to_gather_evidence
167
+ with mp.Pool(processes=args.processes) as pool:
168
+ starmap_with_kwargs(pool=pool, func=func, kwargs_iter=kwarg_list)
169
+
170
+
171
+ def parse_args():
172
+ parser = argparse.ArgumentParser()
173
+ parser.add_argument(
174
+ "--model", default="gpt-3.5-turbo", choices=["gpt-4", "gpt-3.5-turbo"]
175
+ )
176
+ parser.add_argument("--limit", default=0, type=int)
177
+ parser.add_argument("--refresh", action="store_true")
178
+ parser.add_argument("--num_search_results_to_keep", type=int, default=3)
179
+ parser.add_argument("--processes", type=int, default=1)
180
+ return parser.parse_args()
181
+
182
+
183
+ if __name__ == "__main__":
184
+ main()
flagged/log.csv ADDED
@@ -0,0 +1,5 @@
+ Input Text,Model,Fact Checked Output,flag,username,timestamp
+ ,gpt-4-1106-preview,,,,2024-02-16 13:59:41.379589
+ The earth is flat,gpt-4-1106-preview,"The earth is flat[^The-earth-is-flat]
+
+ [^The-earth-is-flat]: ❗ Claim: The earth is flat. 👉 Unsupported ""Flat-Earth ideas are based on basic scientific misunderstandings that can be easily refuted."",""For most people, even those who have no physics background, the evidence for a spherical Earth is obvious."" The claim that the Earth is flat is directly contradicted by the evidence, which states that Flat-Earth ideas are based on basic scientific misunderstandings and that the evidence for a spherical Earth is obvious., URLs: https://physicsworld.com/a/fighting-flat-earth-theory/, date accessed: 2024-02-16 ",,,2024-02-16 14:00:40.524144
generate_search_queries.py ADDED
@@ -0,0 +1,137 @@
1
+ import argparse
2
+ from pathlib import Path
3
+ import numpy as np
4
+ from pipeline_paths import PIPELINE_PATHS
5
+ import json
6
+ from zsvision.zs_utils import BlockTimer
7
+ from typing import Dict, List
8
+ from llm_api_utils import (
9
+ call_openai_with_exponetial_backoff,
10
+ estimate_cost_of_text_generation_api_call,
11
+ init_openai_with_api_key,
12
+ )
13
+
14
+
15
+ def generate_search_queries(args, src_path: Path, dest_path: Path):
16
+ """
17
+ Generate a search query that can be used to verify a claim.
18
+ """
19
+ init_openai_with_api_key()
20
+ with open(src_path, "r") as f:
21
+ claims_and_sources = json.load(f)
22
+
23
+ # exclude subjective claims
24
+ original_num_claims = len(claims_and_sources)
25
+ claims_and_sources = [
26
+ claim_and_source
27
+ for claim_and_source in claims_and_sources
28
+ if claim_and_source["label"] == "objective"
29
+ ]
30
+ num_claims = len(claims_and_sources)
31
+ print(
32
+ f"Filtered from {original_num_claims} claims to {num_claims} objective claims"
33
+ )
34
+
35
+ # we limit the number of claims per api call (otherwise GPT-4 can choke)
36
+ num_batches = int(np.ceil(num_claims / args.max_claims_per_api_call))
37
+ claims_and_sources_batches = [
38
+ batch.tolist() for batch in np.array_split(claims_and_sources, num_batches)
39
+ ]
40
+ queries = []
41
+
42
+ all_claims_str = "\n".join([claim["claim"] for claim in claims_and_sources])
43
+
44
+ for idx, claims_and_sources_batch in enumerate(claims_and_sources_batches):
45
+ print(
46
+ f"Processing batch {idx+1} of {len(claims_and_sources_batches)} (containing {len(claims_and_sources_batch)} claims)"
47
+ )
48
+
49
+ claim_str = "\n".join([claim["claim"] for claim in claims_and_sources_batch])
50
+ num_batch_claims = len(claims_and_sources_batch)
51
+
52
+ # we provide the full list of claims as context (to help resolve ambiguity), but only ask for queries for the current batch
53
+ prompt = f"""\
54
+ You are working as part of a team and your individual task is to help check a subset of the following claims:\n
55
+ {all_claims_str}
56
+
57
+ Your individual task is as follows. \
58
+ For each of the {num_batch_claims} claims made below, provide a suitable Google search query that would enable a human to verify the claim. \
59
+ Note that Google can perform calculations and conversions, so you can use it to check numerical claims. \
60
+ If you think no Google query will be useful, then write "no suitable query". \
61
+ Each proposed Google search query should be on a separate line (do not prefix your queries with bullet points or numbers). \
62
+ There should be {num_batch_claims} queries in total.\n \
63
+
64
+ {claim_str}
65
+ """
66
+ persona = "You are a careful research assistant who helps with fact-checking and editing informative articles."
67
+ system_message = {"role": "system", "content": persona}
68
+ user_message = {"role": "user", "content": prompt}
69
+ messages = [system_message, user_message]
70
+
71
+ with BlockTimer(f"Using OpenAI API to extract claims with {args.model}"):
72
+ response = call_openai_with_exponetial_backoff(
73
+ model=args.model,
74
+ temperature=args.temperature,
75
+ messages=messages,
76
+ )
77
+
78
+ cost = estimate_cost_of_text_generation_api_call(
79
+ model=args.model, response=response, verbose=True
80
+ )
81
+
82
+ proposed_queries = response.choices[0].message.content
83
+ batch_queries = proposed_queries.split("\n")
84
+ assert (
85
+ len(batch_queries) == num_batch_claims
86
+ ), f"Expected {num_batch_claims} queries, but got {len(batch_queries)}"
87
+ print(f"Generated {len(batch_queries)} queries (cost: {cost:.4f} USD)")
88
+ queries.extend(batch_queries)
89
+
90
+ querysets = []
91
+ for claim_and_source, query in zip(claims_and_sources, queries):
92
+ queryset = {**claim_and_source, "search_query": query}
93
+ querysets.append(queryset)
94
+
95
+ dest_path.parent.mkdir(exist_ok=True, parents=True)
96
+ with open(dest_path, "w") as f:
97
+ json.dump(querysets, f, indent=4, sort_keys=True)
98
+
99
+
100
+ def main():
101
+ args = parse_args()
102
+
103
+ src_paths = list(
104
+ PIPELINE_PATHS["extracted_claims_with_classifications_dir"].glob("**/*.json")
105
+ )
106
+ print(
107
+ f"Found {len(src_paths)} claim files in {PIPELINE_PATHS['extracted_claims_with_classifications_dir']}"
108
+ )
109
+ dest_dir = PIPELINE_PATHS["search_queries_for_evidence"]
110
+
111
+ for src_path in src_paths:
112
+ dest_path = dest_dir / src_path.relative_to(
113
+ PIPELINE_PATHS["extracted_claims_with_classifications_dir"]
114
+ )
115
+ if not dest_path.exists() or args.refresh:
116
+ generate_search_queries(
117
+ args=args,
118
+ src_path=src_path,
119
+ dest_path=dest_path,
120
+ )
121
+
122
+
123
+ def parse_args():
124
+ parser = argparse.ArgumentParser()
125
+ parser.add_argument("--temperature", type=float, default=0)
126
+ parser.add_argument(
127
+ "--model", default="gpt-3.5-turbo", choices=["gpt-4", "gpt-3.5-turbo"]
128
+ )
129
+ parser.add_argument("--dest_dir", default="data/search_queries", type=Path)
130
+ parser.add_argument("--api_key_path", default="OPENAI_API_KEY.txt")
131
+ parser.add_argument("--max_claims_per_api_call", type=int, default=10)
132
+ parser.add_argument("--refresh", action="store_true")
133
+ return parser.parse_args()
134
+
135
+
136
+ if __name__ == "__main__":
137
+ main()
llm_api_utils.py ADDED
@@ -0,0 +1,60 @@
+ import random
+ import cohere
+ import os
+ import openai
+ import time
+ import backoff
+
+
+ PRICE_PER_1K_TOKENS = {
+     "gpt-4": {"prompt": 0.03, "completion": 0.06},
+     "gpt-4-1106-preview": {"prompt": 0.01, "completion": 0.03},
+     "gpt-3.5-turbo": {"prompt": 0.001, "completion": 0.002},
+     "ada": {"embed": 0.0004},
+     "text-embedding-ada-002": {"embed": 0.0001},
+ }
+
+
+ EMBEDDING_DIMENSIONS = {
+     "ada": 1536,
+     "text-embedding-ada-002": 1536,
+ }
+
+
+ def estimate_cost_of_text_generation_api_call(
+     model: str, response: dict, verbose: bool
+ ) -> float:
+     completion_tokens = response.usage.completion_tokens
+     prompt_tokens = response.usage.prompt_tokens
+     total_tokens = response.usage.total_tokens
+
+     prompt_cost = prompt_tokens / 1000 * PRICE_PER_1K_TOKENS[model]["prompt"]
+     completion_cost = (
+         completion_tokens / 1000 * PRICE_PER_1K_TOKENS[model]["completion"]
+     )
+     cost = prompt_cost + completion_cost
+
+     if verbose:
+         summary = f"""\
+         Used {prompt_tokens} prompt tokens, {completion_tokens} completion tokens, {total_tokens} total tokens
+         Estimated cost: {cost:.4f} USD
+         """
+         print(summary)
+     return cost
+
+
+ @backoff.on_exception(backoff.expo, (openai.RateLimitError, openai.APIConnectionError))
+ def call_openai_with_exponetial_backoff(**kwargs):
+     rand_sleep_in_secs = 5 * random.random()
+     time.sleep(rand_sleep_in_secs)
+     return openai.chat.completions.create(**kwargs)
+
+
+ def init_openai_with_api_key():
+     openai.api_key = os.environ.get("OPENAI_API_KEY")
+
+
+ def init_cohere_with_api_key():
+     COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
+     co = cohere.Client(COHERE_API_KEY)
+     return co
objective_claims.txt ADDED
@@ -0,0 +1,3 @@
+ Hannah Arendt was born in 1906.
+ Hannah Arendt was raised in a Jewish family.
+ Arendt fled Germany in 1933.
pipeline_paths.py ADDED
@@ -0,0 +1,25 @@
+ """A module that contains the paths to the various files and folders used in the pipeline."""
+ from pathlib import Path
+
+ PIPELINE_PATHS = {
+     # google queries
+     "google_custom_search_engine_id_path": "google_custom_search_engine_id.txt",
+     # raw inputs to pipeline
+     "source_document_dir": "data/source_documents",
+     # claim extraction
+     "extracted_claims_dir": "data/extracted_claims",
+     "extracted_claims_with_anchor_fixes_dir": "data/extracted_claims_with_anchor_fixes",
+     "extracted_claims_with_classifications_dir": "data/extracted_with_classifications_claims",
+     "objective_claims_dir": "data/objective_claims",
+     # evidence gathering
+     "cohere_wikipedia_evidence": "data/evidence_gathering/cohere_wikipedia",
+     "google_search_results_evidence": "data/evidence_gathering/google_search_results",
+     "faiss_db_embeddings_for_evidence": "data/faiss_db_embeddings_for_evidence",
+     "web_evidence_chunks": "data/evidence_gathering/web_evidence_chunks",
+     # claim evaluation
+     "evaluated_claims_dir": "data/claim_evaluation/claim_verdicts",
+     # reformatted document
+     "fact_checked_document_dir": "data/fact_checked_documents",
+ }
+
+ PIPELINE_PATHS = {key: Path(value) for key, value in PIPELINE_PATHS.items()}
requirements.in ADDED
@@ -0,0 +1,18 @@
+ zsvision
+ openai
+ cohere
+ ipdb
+ weaviate-client
+ bs4
+ json5
+ fake-useragent
+ newspaper3k
+ google-api-python-client
+ faiss-cpu
+ tiktoken
+ langchain
+ python-dotenv
+ langchain-community
+ langchain-openai
+ pip-tools
+ gradio
requirements.txt ADDED
@@ -0,0 +1,508 @@
1
+ #
2
+ # This file is autogenerated by pip-compile with Python 3.11
3
+ # by the following command:
4
+ #
5
+ # pip-compile requirements.in
6
+ #
7
+ aiofiles==23.2.1
8
+ # via gradio
9
+ aiohttp==3.9.3
10
+ # via
11
+ # cohere
12
+ # langchain
13
+ # langchain-community
14
+ aiosignal==1.3.1
15
+ # via aiohttp
16
+ altair==5.2.0
17
+ # via gradio
18
+ annotated-types==0.6.0
19
+ # via pydantic
20
+ anyio==4.2.0
21
+ # via
22
+ # httpx
23
+ # langchain-core
24
+ # openai
25
+ # starlette
26
+ asttokens==2.4.1
27
+ # via stack-data
28
+ attrs==23.2.0
29
+ # via
30
+ # aiohttp
31
+ # jsonschema
32
+ # referencing
33
+ authlib==1.3.0
34
+ # via weaviate-client
35
+ backoff==2.2.1
36
+ # via cohere
37
+ beartype==0.17.2
38
+ # via zsvision
39
+ beautifulsoup4==4.12.3
40
+ # via
41
+ # bs4
42
+ # feedfinder2
43
+ # newspaper3k
44
+ bs4==0.0.2
45
+ # via -r requirements.in
46
+ build==1.0.3
47
+ # via pip-tools
48
+ cachetools==5.3.2
49
+ # via google-auth
50
+ certifi==2024.2.2
51
+ # via
52
+ # httpcore
53
+ # httpx
54
+ # requests
55
+ cffi==1.16.0
56
+ # via cryptography
57
+ charset-normalizer==3.3.2
58
+ # via requests
59
+ click==8.1.7
60
+ # via
61
+ # nltk
62
+ # pip-tools
63
+ # typer
64
+ # uvicorn
65
+ cohere==4.47
66
+ # via -r requirements.in
67
+ colorama==0.4.6
68
+ # via typer
69
+ contourpy==1.2.0
70
+ # via matplotlib
71
+ cryptography==42.0.2
72
+ # via authlib
73
+ cssselect==1.2.0
74
+ # via newspaper3k
75
+ cycler==0.12.1
76
+ # via matplotlib
77
+ dataclasses-json==0.6.4
78
+ # via
79
+ # langchain
80
+ # langchain-community
81
+ decorator==5.1.1
82
+ # via
83
+ # ipdb
84
+ # ipython
85
+ distro==1.9.0
86
+ # via openai
87
+ executing==2.0.1
88
+ # via stack-data
89
+ faiss-cpu==1.7.4
90
+ # via -r requirements.in
91
+ fake-useragent==1.4.0
92
+ # via -r requirements.in
93
+ fastapi==0.109.2
94
+ # via gradio
95
+ fastavro==1.9.4
96
+ # via cohere
97
+ feedfinder2==0.0.4
98
+ # via newspaper3k
99
+ feedparser==6.0.11
100
+ # via newspaper3k
101
+ ffmpy==0.3.2
102
+ # via gradio
103
+ filelock==3.13.1
104
+ # via
105
+ # huggingface-hub
106
+ # tldextract
107
+ fonttools==4.48.1
108
+ # via matplotlib
109
+ frozenlist==1.4.1
110
+ # via
111
+ # aiohttp
112
+ # aiosignal
113
+ fsspec==2024.2.0
114
+ # via
115
+ # gradio-client
116
+ # huggingface-hub
117
+ google-api-core==2.17.0
118
+ # via google-api-python-client
119
+ google-api-python-client==2.118.0
120
+ # via -r requirements.in
121
+ google-auth==2.27.0
122
+ # via
123
+ # google-api-core
124
+ # google-api-python-client
125
+ # google-auth-httplib2
126
+ google-auth-httplib2==0.2.0
127
+ # via google-api-python-client
128
+ googleapis-common-protos==1.62.0
129
+ # via google-api-core
130
+ gradio==4.19.0
131
+ # via -r requirements.in
132
+ gradio-client==0.10.0
133
+ # via gradio
134
+ grpcio==1.60.1
135
+ # via
136
+ # grpcio-health-checking
137
+ # grpcio-tools
138
+ # weaviate-client
139
+ grpcio-health-checking==1.60.1
140
+ # via weaviate-client
141
+ grpcio-tools==1.60.1
142
+ # via weaviate-client
143
+ h11==0.14.0
144
+ # via
145
+ # httpcore
146
+ # uvicorn
147
+ h5py==3.10.0
148
+ # via hickle
149
+ hickle==5.0.2
150
+ # via zsvision
151
+ httpcore==1.0.2
152
+ # via httpx
153
+ httplib2==0.22.0
154
+ # via
155
+ # google-api-python-client
156
+ # google-auth-httplib2
157
+ httpx==0.26.0
158
+ # via
159
+ # gradio
160
+ # gradio-client
161
+ # openai
162
+ # weaviate-client
163
+ huggingface-hub==0.20.3
164
+ # via
165
+ # gradio
166
+ # gradio-client
167
+ humanize==4.9.0
168
+ # via zsvision
169
+ idna==3.6
170
+ # via
171
+ # anyio
172
+ # httpx
173
+ # requests
174
+ # tldextract
175
+ # yarl
176
+ importlib-metadata==6.11.0
177
+ # via cohere
178
+ importlib-resources==6.1.1
179
+ # via gradio
180
+ ipdb==0.13.13
181
+ # via -r requirements.in
182
+ ipython==8.21.0
183
+ # via ipdb
184
+ jedi==0.19.1
185
+ # via ipython
186
+ jieba3k==0.35.1
187
+ # via newspaper3k
188
+ jinja2==3.1.3
189
+ # via
190
+ # altair
191
+ # gradio
192
+ joblib==1.3.2
193
+ # via nltk
194
+ json5==0.9.14
195
+ # via -r requirements.in
196
+ jsonpatch==1.33
197
+ # via
198
+ # langchain
199
+ # langchain-core
200
+ jsonpointer==2.4
201
+ # via jsonpatch
202
+ jsonschema==4.21.1
203
+ # via altair
204
+ jsonschema-specifications==2023.12.1
205
+ # via jsonschema
206
+ kiwisolver==1.4.5
207
+ # via matplotlib
208
+ langchain==0.1.7
209
+ # via -r requirements.in
210
+ langchain-community==0.0.20
211
+ # via
212
+ # -r requirements.in
213
+ # langchain
214
+ langchain-core==0.1.23
215
+ # via
216
+ # langchain
217
+ # langchain-community
218
+ # langchain-openai
219
+ langchain-openai==0.0.6
220
+ # via -r requirements.in
221
+ langsmith==0.0.87
222
+ # via
223
+ # langchain
224
+ # langchain-community
225
+ # langchain-core
226
+ lxml==5.1.0
227
+ # via newspaper3k
228
+ markdown-it-py==3.0.0
229
+ # via rich
230
+ markupsafe==2.1.5
231
+ # via
232
+ # gradio
233
+ # jinja2
234
+ marshmallow==3.20.2
235
+ # via dataclasses-json
236
+ matplotlib==3.8.2
237
+ # via
238
+ # gradio
239
+ # zsvision
240
+ matplotlib-inline==0.1.6
241
+ # via ipython
242
+ mdurl==0.1.2
243
+ # via markdown-it-py
244
+ mergedeep==1.3.4
245
+ # via zsvision
246
+ msgpack==1.0.7
247
+ # via
248
+ # msgpack-numpy
249
+ # zsvision
250
+ msgpack-numpy==0.4.8
251
+ # via zsvision
252
+ multidict==6.0.5
253
+ # via
254
+ # aiohttp
255
+ # yarl
256
+ mypy-extensions==1.0.0
257
+ # via typing-inspect
258
+ newspaper3k==0.2.8
259
+ # via -r requirements.in
260
+ nltk==3.8.1
261
+ # via newspaper3k
262
+ numpy==1.26.4
263
+ # via
264
+ # altair
265
+ # contourpy
266
+ # gradio
267
+ # h5py
268
+ # hickle
269
+ # langchain
270
+ # langchain-community
271
+ # langchain-openai
272
+ # matplotlib
273
+ # msgpack-numpy
274
+ # pandas
275
+ # scipy
276
+ # zsvision
277
+ openai==1.12.0
278
+ # via
279
+ # -r requirements.in
280
+ # langchain-openai
281
+ orjson==3.9.14
282
+ # via gradio
283
+ packaging==23.2
284
+ # via
285
+ # altair
286
+ # build
287
+ # gradio
288
+ # gradio-client
289
+ # huggingface-hub
290
+ # langchain-core
291
+ # marshmallow
292
+ # matplotlib
293
+ pandas==2.2.0
294
+ # via
295
+ # altair
296
+ # gradio
297
+ parso==0.8.3
298
+ # via jedi
299
+ pexpect==4.9.0
300
+ # via ipython
301
+ pillow==10.2.0
302
+ # via
303
+ # gradio
304
+ # matplotlib
305
+ # newspaper3k
306
+ pip-tools==7.3.0
307
+ # via -r requirements.in
308
+ prompt-toolkit==3.0.43
309
+ # via ipython
310
+ protobuf==4.25.2
311
+ # via
312
+ # google-api-core
313
+ # googleapis-common-protos
314
+ # grpcio-health-checking
315
+ # grpcio-tools
316
+ ptyprocess==0.7.0
317
+ # via pexpect
318
+ pure-eval==0.2.2
319
+ # via stack-data
320
+ pyasn1==0.5.1
321
+ # via
322
+ # pyasn1-modules
323
+ # rsa
324
+ pyasn1-modules==0.3.0
325
+ # via google-auth
326
+ pycparser==2.21
327
+ # via cffi
328
+ pydantic==2.6.1
329
+ # via
330
+ # fastapi
331
+ # gradio
332
+ # langchain
333
+ # langchain-core
334
+ # langsmith
335
+ # openai
336
+ # weaviate-client
337
+ pydantic-core==2.16.2
338
+ # via pydantic
339
+ pydub==0.25.1
340
+ # via gradio
341
+ pygments==2.17.2
342
+ # via
343
+ # ipython
344
+ # rich
345
+ pyparsing==3.1.1
346
+ # via
347
+ # httplib2
348
+ # matplotlib
349
+ pyproject-hooks==1.0.0
350
+ # via build
351
+ python-dateutil==2.8.2
352
+ # via
353
+ # matplotlib
354
+ # newspaper3k
355
+ # pandas
356
+ python-dotenv==1.0.1
357
+ # via -r requirements.in
358
+ python-multipart==0.0.9
359
+ # via gradio
360
+ pytz==2024.1
361
+ # via pandas
362
+ pyyaml==6.0.1
363
+ # via
364
+ # gradio
365
+ # huggingface-hub
366
+ # langchain
367
+ # langchain-community
368
+ # langchain-core
369
+ # newspaper3k
370
+ # zsvision
371
+ referencing==0.33.0
372
+ # via
373
+ # jsonschema
374
+ # jsonschema-specifications
375
+ regex==2023.12.25
376
+ # via
377
+ # nltk
378
+ # tiktoken
379
+ requests==2.31.0
380
+ # via
381
+ # cohere
382
+ # feedfinder2
383
+ # google-api-core
384
+ # huggingface-hub
385
+ # langchain
386
+ # langchain-community
387
+ # langchain-core
388
+ # langsmith
389
+ # newspaper3k
390
+ # requests-file
391
+ # tiktoken
392
+ # tldextract
393
+ # weaviate-client
394
+ requests-file==2.0.0
395
+ # via tldextract
396
+ rich==13.7.0
397
+ # via typer
398
+ rpds-py==0.18.0
399
+ # via
400
+ # jsonschema
401
+ # referencing
402
+ rsa==4.9
403
+ # via google-auth
404
+ ruff==0.2.1
405
+ # via gradio
406
+ scipy==1.12.0
407
+ # via zsvision
408
+ semantic-version==2.10.0
409
+ # via gradio
410
+ sgmllib3k==1.0.0
411
+ # via feedparser
412
+ shellingham==1.5.4
413
+ # via typer
414
+ six==1.16.0
415
+ # via
416
+ # asttokens
417
+ # feedfinder2
418
+ # python-dateutil
419
+ sniffio==1.3.0
420
+ # via
421
+ # anyio
422
+ # httpx
423
+ # openai
424
+ soupsieve==2.5
425
+ # via beautifulsoup4
426
+ sqlalchemy==2.0.27
427
+ # via
428
+ # langchain
429
+ # langchain-community
430
+ stack-data==0.6.3
431
+ # via ipython
432
+ starlette==0.36.3
433
+ # via fastapi
434
+ tenacity==8.2.3
435
+ # via
436
+ # langchain
437
+ # langchain-community
438
+ # langchain-core
439
+ tiktoken==0.6.0
440
+ # via
441
+ # -r requirements.in
442
+ # langchain-openai
443
+ tinysegmenter==0.3
444
+ # via newspaper3k
445
+ tldextract==5.1.1
446
+ # via newspaper3k
447
+ tomlkit==0.12.0
448
+ # via gradio
449
+ toolz==0.12.1
450
+ # via altair
451
+ tqdm==4.66.2
452
+ # via
453
+ # huggingface-hub
454
+ # nltk
455
+ # openai
456
+ traitlets==5.14.1
457
+ # via
458
+ # ipython
459
+ # matplotlib-inline
460
+ typeguard==4.1.5
461
+ # via zsvision
462
+ typer[all]==0.9.0
463
+ # via gradio
464
+ typing-extensions==4.9.0
465
+ # via
466
+ # fastapi
467
+ # gradio
468
+ # gradio-client
469
+ # huggingface-hub
470
+ # openai
471
+ # pydantic
472
+ # pydantic-core
473
+ # sqlalchemy
474
+ # typeguard
475
+ # typer
476
+ # typing-inspect
477
+ typing-inspect==0.9.0
478
+ # via dataclasses-json
479
+ tzdata==2024.1
480
+ # via pandas
481
+ uritemplate==4.1.1
482
+ # via google-api-python-client
483
+ urllib3==2.2.0
484
+ # via
485
+ # cohere
486
+ # requests
487
+ uvicorn==0.27.1
488
+ # via gradio
489
+ validators==0.22.0
490
+ # via weaviate-client
491
+ wcwidth==0.2.13
492
+ # via prompt-toolkit
493
+ weaviate-client==4.4.4
494
+ # via -r requirements.in
495
+ websockets==11.0.3
496
+ # via gradio-client
497
+ wheel==0.42.0
498
+ # via pip-tools
499
+ yarl==1.9.4
500
+ # via aiohttp
501
+ zipp==3.17.0
502
+ # via importlib-metadata
503
+ zsvision==0.7.12
504
+ # via -r requirements.in
505
+
506
+ # The following packages are considered to be unsafe in a requirements file:
507
+ # pip
508
+ # setuptools
run_pipeline.py ADDED
@@ -0,0 +1,134 @@
1
+ from step1_api_claim_extractor import ClaimExtractor
2
+ from step2_api_fix_passage_anchors import FixAnchors
3
+ from step3_api_identify_objective_claims import ClassifyClaims
4
+ from step41_api_fetch_cohere_wikipedia_evidence import CohereEvidence
5
+ from step42_api_fetch_google_search_evidence import GoogleEvidence
6
+ from step5_api_embed_search_results import EmbedResults
7
+ from step6_api_claims_to_evidence import ClaimToEvidence
8
+ from step7_api_check_claims_against_evidence import CheckClaimAgainstEvidence
9
+ from step8_api_format_fact_checked_document import FormatDocument
10
+
11
+ import argparse
12
+ import json
13
+ import os
14
+ import copy
15
+ from dotenv import load_dotenv
16
+
17
+ load_dotenv()
18
+
19
+
20
+ def get_fact_checked(text_input, model="gpt-3.5-turbo", mode="slow"):
21
+ text_input = text_input.strip()
22
+
23
+ results = {}
24
+
25
+ # STEP1
26
+ print("Step1: Extracting claims")
27
+ step1 = ClaimExtractor(model=model)
28
+ step1_json = step1.extract_claims(text_input)
29
+ results["step1_claims"] = copy.deepcopy(step1_json)
30
+
31
+ # STEP2
32
+ print("Step2: Anchoring claims")
33
+ try:
34
+ step2 = FixAnchors(model=model)
35
+ step2_json = step2.fix_passage_anchors(step1_json, text_input)
36
+ except Exception:
37
+ if model != "gpt-4":
38
+ print("Step2 failed with gpt-3.5, trying with gpt-4!")
39
+ step2 = FixAnchors(model="gpt-4")
40
+ step2_json = step2.fix_passage_anchors(step1_json, text_input)
41
+ results["step2_anchored_claims"] = copy.deepcopy(step2_json)
42
+
43
+ # STEP3
44
+ print("Step3: Classifying claims")
45
+ step3 = ClassifyClaims(model=model)
46
+ step3_json = step3.classify_claims(step2_json)
47
+ step3_filter = step3.filter_to_objective_claims(step3_json)
48
+ results["step3_classify_claims"] = copy.deepcopy(step3_json)
49
+ results["step3_objective_claims"] = copy.deepcopy(step3_filter)
50
+
51
+ if len(step3_filter) == 0:
52
+ return {"fact_checked_md": "No objective claims found!"}
53
+
54
+ # STEP4.1
55
+ print("Step4.1: Gathering evidence")
56
+ step4_cohere = CohereEvidence()
57
+ step4_json_cohere = (
58
+ step4_cohere.fetch_cohere_semantic_search_results_to_gather_evidence(
59
+ step3_filter
60
+ )
61
+ )
62
+ results["step41_cohere_evidence"] = copy.deepcopy(step4_json_cohere)
63
+
64
+ # STEP4.2
65
+ print("Step4.2: Gathering evidence")
66
+ step4_json_google = None
67
+ if mode == "slow":
68
+ step4_json_google = ""
69
+ try:
70
+ step4_google = GoogleEvidence(model=model)
71
+ step4_json_google = step4_google.fetch_search_results_to_gather_evidence(
72
+ step3_filter
73
+ )
74
+ except Exception as e:
75
+ print(f"Google search failed: {e}")
76
+ pass
77
+ results["step42_google_evidence"] = copy.deepcopy(step4_json_google)
78
+
79
+ embedding_model = "text-embedding-ada-002"
80
+ text_embedding_chunk_size = 500
81
+
82
+ srcs = [step4_json_cohere]
83
+ if step4_json_google:
84
+ srcs.append(step4_json_google)
85
+
86
+ # STEP 5
87
+ print("Step5: Embedding evidence")
88
+ step5 = EmbedResults(
89
+ embedding_model=embedding_model,
90
+ text_embedding_chunk_size=text_embedding_chunk_size,
91
+ )
92
+ faiss_db = step5.embed_for_uuid(srcs)
93
+
94
+ # STEP 6
95
+ print("Step6: Linking claims to evidence")
96
+ step6 = ClaimToEvidence()
97
+ step6_json = step6.link_claims_to_evidence(step3_filter, faiss_db)
98
+ results["step6_claim_to_evidence"] = copy.deepcopy(step6_json)
99
+
100
+ # STEP 7
101
+ print("Step7: Checking claims against evidence")
102
+ step7 = CheckClaimAgainstEvidence(model=model)
103
+ step7_json = step7.check_claims_against_evidence(step6_json)
104
+ results["step7_evaluated_claims"] = copy.deepcopy(step7_json)
105
+
106
+ # STEP 8
107
+ print("Step8: Formatting")
108
+ if mode == "slow":
109
+ step8 = FormatDocument(model=model, footnote_style="verbose")
110
+ step8_md = step8.reformat_document_to_include_claims(
111
+ text_input, step7_json, footnote_style="verbose"
112
+ )
113
+ step8_md_terse = step8.reformat_document_to_include_claims(
114
+ text_input, step7_json, footnote_style="terse"
115
+ )
116
+
117
+ results["fact_checked_md"] = copy.deepcopy(step8_md)
118
+ results["fact_checked_terse"] = copy.deepcopy(step8_md_terse)
119
+ return results
120
+
121
+
122
+ def main(args):
123
+ with open(args.file, "r") as f:
124
+ text = f.read()
125
+ out = get_fact_checked(text, mode="slow", model=args.model)
126
+ print(out["fact_checked_md"])
127
+
128
+
129
+ if __name__ == "__main__":
130
+ parser = argparse.ArgumentParser(description="Process a file.")
131
+ parser.add_argument("--file", type=str, help="File to process", required=True)
132
+ parser.add_argument("--model", type=str, help="Model to use", required=True)
133
+ args = parser.parse_args()
134
+ main(args)
step1_api_claim_extractor.py ADDED
@@ -0,0 +1,164 @@
1
+ import argparse
2
+ import json
3
+ import multiprocessing as mp
4
+ from zsvision.zs_multiproc import starmap_with_kwargs
5
+ from zsvision.zs_utils import BlockTimer
6
+ from text_utils import is_unique_verbatim_quote, parse_passage_quote_and_claim
7
+ from llm_api_utils import (
8
+ call_openai_with_exponetial_backoff,
9
+ estimate_cost_of_text_generation_api_call,
10
+ init_openai_with_api_key,
11
+ )
12
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
13
+
14
+
15
+ class ClaimExtractor:
16
+ def __init__(
17
+ self,
18
+ temperature=0,
19
+ model="gpt-3.5-turbo",
20
+ filter_str="",
21
+ processes=1,
22
+ refresh=False,
23
+ ):
24
+ """Initializes ClaimExtractor with the provided arguments"""
25
+ self.temperature = temperature
26
+ self.model = model
27
+ self.filter_str = filter_str
28
+ self.processes = processes
29
+ self.refresh = refresh
30
+
31
+ def extract_claims_from_passage(
32
+ self,
33
+ idx: int,
34
+ total: int,
35
+ passage: str,
36
+ ):
37
+ init_openai_with_api_key()
38
+ print(f"Processing passage {idx + 1} of {total}")
39
+ prompt = f"""\
40
+ Task:
41
+ Enumerate all the discrete factual claims or logical assertions stated in the passage that follows the dashed horizontal line below. \
42
+ To allow the claims to be linked to the passage, use the format: `VERBATIM_PASSAGE_QUOTE_FOR_CLAIM: <verbatim passage quote for claim>, CLAIM: <claim>` on each line. \
43
+ The <verbatim passage quote for claim> must be A SINGLE UNEDITED SUBSTRING from the passage that uniquely identifies the claim. \
44
+ The <verbatim passage quote for claim> must carefully preserve all punctuation and clauses from the original passage. \
45
+ This text will be used in the final national exam.
46
+
47
+ ----------
48
+ Here is an example passage, together with the verbatim passage quotes and claims that should be extracted from it:
49
+
50
+ Passage:
51
+ Immanuel Kant was born in 1724 into a modest, devoutly religious family, with his father working as a saddle-maker. \
52
+ He was one of nine children, but only five, including Kant, survived to adulthood. \
53
+ His upbringing was steeped in the Pietist tradition, emphasizing intense religious devotion, a literal interpretation of the Bible, and a strong focus on personal morality. \
54
+ Kant attended the University of Königsberg, studying various subjects, including theology, metaphysics, and natural science. \
55
+ After completing his studies, Kant worked as a private tutor for nine years before returning to the University of Königsberg as a lecturer in 1755. \
56
+ In his works Groundwork of the Metaphysics of Morals (1785) and Critique of Practical Reason (1788), Kant argues that morality is not contingent upon personal desires or cultural norms. \
57
+
58
+
59
+ Extracted source phrases and claims:
60
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Immanuel Kant was born in 1724 into a modest, devoutly religious family [CLAIM] Immanuel Kant was born in 1724.
61
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Immanuel Kant was born in 1724 into a modest, devoutly religious family [CLAIM] Immanuel Kant was born into a modest family.
62
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Immanuel Kant was born in 1724 into a modest, devoutly religious family [CLAIM] Immanuel Kant was born into a devoutly religious family.
63
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] with his father working as a saddle-maker [CLAIM] Immnauel Kant's father worked as a saddle-maker.
64
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] He was one of nine children [CLAIM] Immanuel Kant was one of nine children.
65
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] but only five, including Kant survived to adulthood [CLAIM] Only five of Immanuel Kant's parents' children survived to adulthood.
66
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] His upbringing was steeped in the Pietist tradition [CLAIM] Immanuel Kant's upbringing was steeped in the Pietist tradition.
67
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] emphasizing intense religious devotion [CLAIM] Immanuel Kant's upbringing emphasized intense religious devotion.
68
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] a literal interpretation of the Bible [CLAIM] Immanuel Kant's upbringing emphasized a literal interpretation of the Bible.
69
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] a strong focus on personal morality [CLAIM] Immanuel Kant's upbringing emphasized a strong focus on personal morality.
70
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Kant attended the University of Königsberg [CLAIM] Immanuel Kant attended the University of Königsberg.
71
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] studying various subjects, including theology, metaphysics, and natural science [CLAIM] Immanuel Kant studied theology.
72
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] studying various subjects, including theology, metaphysics, and natural science [CLAIM] Immanuel Kant studied metaphysics.
73
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] studying various subjects, including theology, metaphysics, and natural science [CLAIM] Immanuel Kant studied natural science.
74
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] After completing his studies [CLAIM] Immanuel Kant completed his studies.
75
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] After completing his studies, Kant worked as a private tutor for nine years [CLAIM] After completing his studies, Immanuel Kant worked as a private tutor.
76
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] before returning to the University of Königsberg as a lecturer in 1755 [CLAIM] Immanuel Kant returned to the University of Königsberg as a lecturer in 1755.
77
+
78
+ ----------
79
+ Passage:
80
+ {passage}
81
+
82
+ Extracted source phrases and claims:\
83
+ """
84
+ persona = "You are a careful research assistant who helps with fact-checking and editing informative articles."
85
+ system_message = {"role": "system", "content": persona}
86
+ user_message = {"role": "user", "content": prompt}
87
+ messages = [system_message, user_message]
88
+
89
+ with BlockTimer(f"Using OpenAI API to extract claims with {self.model}"):
90
+ response = call_openai_with_exponetial_backoff(
91
+ model=self.model,
92
+ temperature=self.temperature,
93
+ messages=messages,
94
+ )
95
+
96
+ cost = estimate_cost_of_text_generation_api_call(
97
+ model=self.model, response=response, verbose=True
98
+ )
99
+ content = response.choices[0].message.content
100
+ content = content.strip()
101
+ quotes_and_claims = content.split("\n")
102
+
103
+ parsed_claims = []
104
+ for quote_and_claim in quotes_and_claims:
105
+ quote_and_claim = quote_and_claim.strip()
106
+ if "[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM]" not in quote_and_claim:
107
+ quote_and_claim = quote_and_claim.replace(
108
+ "VERBATIM_PASSAGE_QUOTE_FOR_CLAIM: ",
109
+ "[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM]: ",
110
+ )
111
+ if "[CLAIM]" not in quote_and_claim:
112
+ quote_and_claim = quote_and_claim.replace(" CLAIM:", " [CLAIM]:")
113
+
114
+ if "[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM]" not in quote_and_claim:
115
+ continue
116
+ quote_and_claim = quote_and_claim.strip()
117
+ parsed = parse_passage_quote_and_claim(quote_and_claim)
118
+ is_unique_and_verbatim = is_unique_verbatim_quote(
119
+ verbatim_quote=parsed["verbatim_quote"], original_passage=passage
120
+ )
121
+ parsed["is_unique_and_verbatim"] = is_unique_and_verbatim
122
+ parsed_claims.append(parsed)
123
+
124
+ return {"claims": parsed_claims, "cost": cost}
125
+
126
+ def extract_claims(self, text_input):
127
+ """
128
+ Extracts claims from text_input and return the extracted claims in a json file
129
+ """
130
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
131
+ docs = text_splitter.create_documents([text_input])
132
+ print(f"Split text into {len(docs)} documents")
133
+ all_claims = []
134
+
135
+ kwarg_list = []
136
+ for idx, doc in enumerate(docs):
137
+ # remove newlines from the passage to avoid a confusing prompt format
138
+ passage = doc.page_content.replace("\n", " ")
139
+ kwarg_list.append(
140
+ {
141
+ "idx": idx,
142
+ "total": len(docs),
143
+ "passage": passage,
144
+ }
145
+ )
146
+
147
+ if self.processes == 1:
148
+ results = []
149
+ for kwargs in kwarg_list:
150
+ results.append(self.extract_claims_from_passage(**kwargs))
151
+ else: # multiprocess
152
+ func = self.extract_claims_from_passage
153
+ with mp.Pool(processes=self.processes) as pool:
154
+ results = starmap_with_kwargs(
155
+ pool=pool, func=func, kwargs_iter=kwarg_list
156
+ )
157
+
158
+ cost = sum([result["cost"] for result in results])
159
+ all_claims = []
160
+ for result in results:
161
+ all_claims.extend(result["claims"])
162
+
163
+ print(f"Returning {len(all_claims)} claims (cost: {cost} USD)")
164
+ return all_claims
step2_api_fix_passage_anchors.py ADDED
@@ -0,0 +1,165 @@
1
+ import argparse
2
+ import json
3
+ import multiprocessing as mp
4
+ from zsvision.zs_multiproc import starmap_with_kwargs
5
+ from zsvision.zs_utils import BlockTimer
6
+ from text_utils import is_unique_verbatim_quote, parse_passage_quote_and_claim
7
+ from llm_api_utils import (
8
+ call_openai_with_exponetial_backoff,
9
+ estimate_cost_of_text_generation_api_call,
10
+ init_openai_with_api_key,
11
+ )
12
+
13
+
14
+ class FixAnchors:
15
+ def __init__(
16
+ self,
17
+ temperature=0,
18
+ model="gpt-3.5-turbo",
19
+ filter_str="",
20
+ processes=8,
21
+ refresh=False,
22
+ ):
23
+ self.temperature = temperature
24
+ self.model = model
25
+ self.filter_str = filter_str
26
+ self.processes = processes
27
+ self.refresh = refresh
28
+
29
+ def fix_passage_anchor(
30
+ self,
31
+ idx: int,
32
+ total: int,
33
+ original_passage: str,
34
+ claim_with_metadata: dict,
35
+ ):
36
+ init_openai_with_api_key()
37
+ print(f"Processing claim with metadata {idx + 1} of {total}")
38
+ # we remove newlines
39
+ original_passage = original_passage.replace("\n", " ")
40
+ assert not claim_with_metadata[
41
+ "is_unique_and_verbatim"
42
+ ], "We should only fix broken passage anchors"
43
+
44
+ prompt = f"""\
45
+ Task:
46
+ A co-worker was tasked with identifying a unique, verbatim quote from a passage that underpins a particular claim. \
47
+ Unfortunately, they made a mistake and the quote they identified is not unique and verbatim. \
48
+ Your task is to fix their quote so that it is both verbatim and unique.
49
+
50
+ -----
51
+ Here is an example passage, together with the claim and the erroneous quote.
52
+
53
+ Passage:
54
+ In 1940, she was interned in a French camp as an enemy alien, but managed to escape and eventually make her way to the United States in 1941. \
55
+ Arendt's experiences during this time would deeply influence her work on totalitarianism and human rights. \
56
+ In New York, she began to immerse herself in academic life, working as an editor, journalist, and lecturer. \
57
+ Her first major work, *The Origins of Totalitarianism*, published in 1951, explored the common roots of Nazism and Stalinism, and established her as a significant voice in political philosophy. \
58
+ ## A Life Of Controversial, Influential Works \
59
+ Throughout her career, Arendt wrote a number of seminal, and controversial, works. *The Human Condition* (1958) examined the role of politics in modern societies and introduced the concept of "the public realm" – the space where individuals act and participate in political life. \
60
+ This exploration of freedom and action would become a recurring theme in her writings. \
61
+ Her 1963 publication, *Eichmann in Jerusalem: A Report on the Banality of Evil*, based on her coverage of Adolf Eichmann's trial, ignited significant controversy. \
62
+ Arendt argued that Eichmann, a key architect of the Holocaust, was not a monster but rather an ordinary bureaucrat who unquestioningly followed orders. \
63
+ The idea of the "banality of evil" continues to influence discussions on the nature of evil and moral responsibility. \
64
+ Arendt's later works, such as *On Revolution* (1963) and *Between Past and Future* (1968), sought to further unravel the complexities of power, authority, and rebellion. \
65
+ Her writings on these subjects continue to resonate with present-day political struggles, as well as with the works of other philosophers like [Immanuel Kant](/philosophy/2023-immanuel-kant-life-and-work) and [Edmund Husserl](/philosophy/2023-edmund-husserl-his-life-and-work). \
66
+
67
+ Claim:
68
+ *The Origins of Totalitarianism* established Arendt as a significant voice in political philosophy.
69
+
70
+ Initial attempt at a unique and verbatim quote:
71
+ [The Origins of Totalitarianism] established her as a significant voice in political philosophy.
72
+
73
+ Correct (unique and verbatim) quote:
74
+ Her first major work, *The Origins of Totalitarianism*, published in 1951, explored the common roots of Nazism and Stalinism, and established her as a significant voice in political philosophy.
75
+ -----
76
+ Passage:
77
+ {original_passage}
78
+
79
+ Claim:
80
+ {claim_with_metadata["claim"]}
81
+
82
+ Initial attempt at a unique verbatim quote:
83
+ {claim_with_metadata["verbatim_quote"]}
84
+
85
+ Correct (unique and verbatim) quote:\
86
+ """
87
+ persona = "You are a careful research assistant who helps with fact-checking and editing informative articles."
88
+ system_message = {"role": "system", "content": persona}
89
+ user_message = {"role": "user", "content": prompt}
90
+ messages = [system_message, user_message]
91
+
92
+ with BlockTimer(f"Using OpenAI API to extract claims with {self.model}"):
93
+ response = call_openai_with_exponetial_backoff(
94
+ model=self.model,
95
+ temperature=self.temperature,
96
+ messages=messages,
97
+ )
98
+
99
+ cost = estimate_cost_of_text_generation_api_call(
100
+ model=self.model, response=response, verbose=True
101
+ )
102
+ content = response.choices[0].message.content
103
+ verbatim_quote = content.rstrip()
104
+ is_unique_and_verbatim = is_unique_verbatim_quote(
105
+ verbatim_quote=verbatim_quote, original_passage=original_passage
106
+ )
107
+ assert (
108
+ is_unique_and_verbatim
109
+ ), f"Failed to fix passage anchor: {claim_with_metadata['verbatim_quote']} was updated to {verbatim_quote} but is not unique and verbatim"
110
+
111
+ claim_with_metadata["verbatim_quote"] = verbatim_quote
112
+ return {"claim_with_metadata": claim_with_metadata, "cost": cost}
113
+
114
+ def fix_passage_anchors(self, claims_with_metadata, original_passage: str):
115
+ kwarg_list = []
116
+ valid_claims_with_metadata = []
117
+ invalid_claims_with_metadata = []
118
+ for idx, claim_with_metadata in enumerate(claims_with_metadata):
119
+ # remove newlines from the passage to avoid a confusing prompt format
120
+ if not claim_with_metadata["is_unique_and_verbatim"]:
121
+ invalid_claims_with_metadata.append(claim_with_metadata)
122
+ else:
123
+ valid_claims_with_metadata.append(claim_with_metadata)
124
+
125
+ for idx, claim_with_metadata in enumerate(invalid_claims_with_metadata):
126
+ kwarg_list.append(
127
+ {
128
+ "idx": idx,
129
+ "total": len(invalid_claims_with_metadata),
130
+ "claim_with_metadata": claim_with_metadata,
131
+ "original_passage": original_passage,
132
+ }
133
+ )
134
+
135
+ if self.processes == 1:
136
+ results = []
137
+ for kwargs in kwarg_list:
138
+ try:
139
+ results.append(self.fix_passage_anchor(**kwargs))
140
+ except Exception as e:
141
+ print(f"Exception in step2: {e}, model: {self.model}")
142
+ print("Skipping this claim!")
143
+ if self.model == "gpt-4":
144
+ pass
145
+ else:
146
+ raise e
147
+ else: # multiprocess
148
+ func = self.fix_passage_anchor
149
+ with mp.Pool(processes=self.processes) as pool:
150
+ results = starmap_with_kwargs(
151
+ pool=pool, func=func, kwargs_iter=kwarg_list
152
+ )
153
+
154
+ cost = sum([result["cost"] for result in results])
155
+ for result in results:
156
+ valid_claims_with_metadata.append(result["claim_with_metadata"])
157
+
158
+ # remove the is_unique_and_verbatim field (no longer needed)
159
+ for claim_with_metadata in valid_claims_with_metadata:
160
+ del claim_with_metadata["is_unique_and_verbatim"]
161
+
162
+ print(
163
+ f"Returning {len(valid_claims_with_metadata)} claims with metadata (cost: {cost} USD)"
164
+ )
165
+ return valid_claims_with_metadata
step3_api_identify_objective_claims.py ADDED
@@ -0,0 +1,185 @@
1
+ import json
2
+ import argparse
3
+ import multiprocessing as mp
4
+ from zsvision.zs_multiproc import starmap_with_kwargs
5
+ from typing import List, Dict
6
+ import numpy as np
7
+ from zsvision.zs_utils import BlockTimer
8
+ from llm_api_utils import (
9
+ call_openai_with_exponetial_backoff,
10
+ estimate_cost_of_text_generation_api_call,
11
+ init_openai_with_api_key,
12
+ )
13
+ import random
14
+
15
+
16
+ class ClassifyClaims:
17
+ def __init__(
18
+ self,
19
+ temperature=0,
20
+ model="gpt-3.5-turbo",
21
+ max_claims_per_api_call=10,
22
+ processes=8,
23
+ filter_str="",
24
+ refresh=False,
25
+ ):
26
+ self.temperature = temperature
27
+ self.model = model
28
+ self.max_claims_per_api_call = max_claims_per_api_call
29
+ self.processes = processes
30
+ self.filter_str = filter_str
31
+ self.refresh = refresh
32
+ self.objective_claims_file = "objective_claims.txt"
33
+ self.subjective_claims_file = "subjective_claims.txt"
34
+
35
+ def parse_classification_label(self, text: str) -> str:
36
+ raw = text.strip()
37
+ if raw.endswith("[objective]"):
38
+ label = "objective"
39
+ elif raw.endswith("[subjective]"):
40
+ label = "subjective"
41
+ else:
42
+ raise ValueError(f"Invalid label: {raw}")
43
+ return label
44
+
45
+ def read_file(self, file_name):
46
+ with open(file_name, "r") as f:
47
+ lines = []
48
+ for line in f:
49
+ parsed_line = line.strip()
50
+ lines.append(parsed_line)
51
+ return lines
52
+
53
+ def create_few_shot_learning_prompt(self) -> str:
54
+ objective_list = self.read_file(self.objective_claims_file)
55
+ subjective_list = self.read_file(self.subjective_claims_file)
56
+ merged_list = list(
57
+ zip(objective_list, ["[objective]"] * len(objective_list))
58
+ ) + list(zip(subjective_list, ["[subjective]"] * len(subjective_list)))
59
+
60
+ # Randomizing the merged list with a specific seed
61
+ seed = 1234
62
+ random.seed(seed)
63
+ random.shuffle(merged_list)
64
+ prompt = "Claims:\n"
65
+ for claim, _ in merged_list:
66
+ prompt += claim + "\n"
67
+ prompt += "\nClassifications:\n"
68
+ for claim, classif in merged_list:
69
+ prompt += claim + " " + classif + "\n"
70
+ return prompt
71
+
72
+ def classify_claim_batch(
73
+ self,
74
+ idx: int,
75
+ total: int,
76
+ claims_and_sources_batch: List[Dict[str, str]],
77
+ ):
78
+ print(
79
+ f"Processing batch {idx+1} of {total} (containing {len(claims_and_sources_batch)} claims)"
80
+ )
81
+
82
+ claim_str = "\n".join([claim["claim"] for claim in claims_and_sources_batch])
83
+ num_batch_claims = len(claims_and_sources_batch)
84
+ few_shot = self.create_few_shot_learning_prompt()
85
+ prompt = f"""\
86
+ Objective claims can be verified based on factual data (such as those that could be verified by \
87
+ referencing an encyclopedia), whereas subjective claims involve a personal interpretation of \
88
+ the data and are more open to debate. \
89
+ For each of the following claims given below the dashed horizontal line, classify them as \
90
+ [subjective] or [objective] by suffixing the claim with the appropriate label. OUTPUT ONLY the class, either subjective or objective for each claim!
91
+
92
+ Here are some examples:
93
+
94
+ {few_shot}
95
+ ----------
96
+ Claims:
97
+ {claim_str}
98
+
99
+ Classifications:\
100
+ """
101
+ persona = "You are a careful research assistant who helps with fact-checking and editing informative articles."
102
+ system_message = {"role": "system", "content": persona}
103
+ user_message = {"role": "user", "content": prompt}
104
+ messages = [system_message, user_message]
105
+
106
+ with BlockTimer(f"Using OpenAI API to extract claims with {self.model}"):
107
+ response = call_openai_with_exponetial_backoff(
108
+ model=self.model,
109
+ temperature=self.temperature,
110
+ messages=messages,
111
+ )
112
+
113
+ cost = estimate_cost_of_text_generation_api_call(
114
+ model=self.model, response=response, verbose=True
115
+ )
116
+
117
+ proposed_classified_claims = response.choices[0].message.content
118
+ batch_classified_claims = proposed_classified_claims.split("\n")
119
+
122
+ assert (
123
+ len(batch_classified_claims) == num_batch_claims
124
+ ), f"Expected {num_batch_claims} claims, but got {len(batch_classified_claims)}"
125
+ print(f"Generated {len(batch_classified_claims)} claims (cost: {cost:.4f} USD)")
126
+
127
+ claims_with_labels = []
128
+ for claim_and_source, classified_claim in zip(
129
+ claims_and_sources_batch, batch_classified_claims
130
+ ):
131
+ claim_label = self.parse_classification_label(classified_claim)
132
+ claim_and_source["label"] = claim_label
133
+ claims_with_labels.append(claim_and_source)
134
+ return {"claims_with_labels": claims_with_labels, "cost": cost}
135
+
136
+ def classify_claims(self, claims_and_sources):
137
+ """
138
+ Classify claims as being either subjective or objective, and write the results to a file.
139
+ """
140
+ init_openai_with_api_key()
141
+ num_claims = len(claims_and_sources)
142
+
143
+ # we limit the number of claims per api call (otherwise GPT-4 can choke)
144
+ num_batches = int(np.ceil(num_claims / self.max_claims_per_api_call))
145
+ claims_and_sources_batches = [
146
+ batch.tolist() for batch in np.array_split(claims_and_sources, num_batches)
147
+ ]
148
+
149
+ kwarg_list = []
150
+ for idx, claims_and_sources_batch in enumerate(claims_and_sources_batches):
151
+ # remove newlines from the passage to avoid a confusing prompt format
152
+ kwarg_list.append(
153
+ {
154
+ "idx": idx,
155
+ "total": len(claims_and_sources_batches),
156
+ "claims_and_sources_batch": claims_and_sources_batch,
157
+ }
158
+ )
159
+
160
+ if self.processes == 1:
161
+ batch_results = []
162
+ for kwargs in kwarg_list:
163
+ batch_results.append(self.classify_claim_batch(**kwargs))
164
+ else: # multiprocess
165
+ func = self.classify_claim_batch
166
+ with mp.Pool(processes=self.processes) as pool:
167
+ batch_results = starmap_with_kwargs(
168
+ pool=pool, func=func, kwargs_iter=kwarg_list
169
+ )
170
+
171
+ cost = sum([result["cost"] for result in batch_results])
172
+ labelled_claims = []
173
+ for batch in batch_results:
174
+ labelled_claims.extend(batch["claims_with_labels"])
175
+
176
+ print(f"Returning {len(labelled_claims)} claims (cost: {cost} USD)")
177
+ return labelled_claims
178
+
179
+ def filter_to_objective_claims(self, claims):
180
+ """Filter claims to only those that are objective."""
181
+
182
+ objective_claims = [claim for claim in claims if claim["label"] == "objective"]
183
+
184
+ print(f"Returning {len(objective_claims)} objective claims")
185
+ return objective_claims
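A minimal usage sketch for `ClassifyClaims`, assuming `OPENAI_API_KEY` is set and the few-shot files `objective_claims.txt` / `subjective_claims.txt` sit next to the script; the example claims are illustrative only:

```python
# Sketch only: assumes OPENAI_API_KEY is exported and the few-shot files exist.
from step3_api_identify_objective_claims import ClassifyClaims

claims_and_sources = [
    {"claim": "Hannah Arendt was born in 1906."},
    {"claim": "The Human Condition is her most compelling book."},
]

classifier = ClassifyClaims(model="gpt-3.5-turbo", processes=1)
labelled = classifier.classify_claims(claims_and_sources)    # adds a "label" field
objective = classifier.filter_to_objective_claims(labelled)  # keeps label == "objective"
print(objective)
```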
step41_api_fetch_cohere_wikipedia_evidence.py ADDED
@@ -0,0 +1,97 @@
1
+ import argparse
2
+ import json
3
+ import multiprocessing as mp
4
+ from datetime import datetime
5
+ import time
6
+ from zsvision.zs_multiproc import starmap_with_kwargs
7
+ import weaviate
8
+ import os
9
+
10
+
11
+ class CohereEvidence:
12
+ def __init__(self, processes=8, filter_str="", refresh=False):
13
+ self.processes = processes
14
+ self.filter_str = filter_str
15
+ self.refresh = refresh
16
+
17
+ def semantic_search(self, query, client, results_lang=""):
18
+ """
19
+ Query the vector database and return the top results.
20
+ """
21
+
22
+ nearText = {"concepts": [query]}
23
+ properties = ["text", "title", "url", "views", "lang", "_additional {distance}"]
24
+
25
+ # To filter by language
26
+ if results_lang != "":
27
+ where_filter = {
28
+ "path": ["lang"],
29
+ "operator": "Equal",
30
+ "valueString": results_lang,
31
+ }
32
+ response = (
33
+ client.query.get("Articles", properties)
34
+ .with_where(where_filter)
35
+ .with_near_text(nearText)
36
+ .with_limit(5)
37
+ .do()
38
+ )
39
+
40
+ # Search all languages
41
+ else:
42
+ response = (
43
+ client.query.get("Articles", properties)
44
+ .with_near_text(nearText)
45
+ .with_limit(5)
46
+ .do()
47
+ )
48
+
49
+ result = response["data"]["Get"]["Articles"]
50
+
51
+ return result
52
+
53
+ def fetch_cohere_semantic_search_results_to_gather_evidence(
54
+ self,
55
+ queryset: dict,
56
+ ):
57
+ """
58
+ Fetch semantic search results from the Cohere Wikipedia embeddings to gather evidence for each claim.
59
+ """
60
+ # 10M wiki embeddings (1M in English)
61
+ weaviate_api_key = os.environ.get("WEAVIATE_API_KEY")
62
+
63
+ cohere_api_key = os.environ.get("COHERE_API_KEY")
64
+
65
+ client = weaviate.Client(
66
+ url="https://cohere-demo.weaviate.network/",
67
+ auth_client_secret=weaviate.auth.AuthApiKey(
68
+ api_key=weaviate_api_key
69
+ ), # Replace w/ your Weaviate instance API key
70
+ additional_headers={
71
+ "X-Cohere-Api-Key": cohere_api_key # Replace with your inference API key
72
+ },
73
+ )
74
+
75
+ while not client.is_ready():
76
+ print(f"Waiting for client to be ready")
77
+ time.sleep(1)
78
+
79
+ for item in queryset:
80
+ results = self.semantic_search(
81
+ item["claim"], client=client, results_lang="en"
82
+ )
83
+ # rename "url" to "link" to be consistent with google results
84
+ reformatted_results = []
85
+ for result in results:
86
+ result["link"] = result.pop("url")
87
+ reformatted_results.append(result)
88
+ item["search_results"] = reformatted_results
89
+
90
+ # update the queryset with new information
91
+ date_str = datetime.now().strftime("%Y-%m-%d")
92
+ results = {
93
+ "documents": queryset,
94
+ "dates": {"results_fetched_from_wikipedia_1M_with_cohere-22-12": date_str},
95
+ }
96
+ print(f"Returning Cohere Wikipedia paragraph for {len(queryset)} queries")
97
+ return results
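The Cohere/Weaviate fetcher can be exercised directly; a sketch, assuming `WEAVIATE_API_KEY` and `COHERE_API_KEY` are set and the public `cohere-demo` Weaviate instance is reachable:

```python
# Sketch only: assumes WEAVIATE_API_KEY and COHERE_API_KEY are set in the environment.
from step41_api_fetch_cohere_wikipedia_evidence import CohereEvidence

queryset = [{"claim": "Hannah Arendt was born in Linden, Germany."}]
fetcher = CohereEvidence()
cohere_results = fetcher.fetch_cohere_semantic_search_results_to_gather_evidence(queryset)
for doc in cohere_results["documents"]:
    for hit in doc["search_results"]:
        print(hit["title"], hit["link"])
```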
step42_api_fetch_google_search_evidence.py ADDED
@@ -0,0 +1,145 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from zsvision.zs_utils import BlockTimer
4
+ import json
5
+ import json5
6
+ import argparse
7
+ import multiprocessing as mp
8
+ from zsvision.zs_multiproc import starmap_with_kwargs
9
+ from datetime import datetime
10
+ import urllib.robotparser
11
+ import urllib.parse
+ import urllib.request
+ import urllib.error
12
+ from urllib.parse import urlunparse
13
+ from utils import get_google_search_results
14
+
15
+ import time
16
+ from random import randint
17
+ from fake_useragent import UserAgent
18
+ from newspaper import Article, Config
19
+
20
+
21
+ class GoogleEvidence:
22
+ def __init__(
23
+ self,
24
+ model="gpt-3.5-turbo",
25
+ limit=0,
26
+ refresh=False,
27
+ num_search_results_to_keep=3,
28
+ filter_str="",
29
+ processes=8,
30
+ ):
31
+ self.model = model
32
+ self.limit = limit
33
+ self.refresh = refresh
34
+ self.num_search_results_to_keep = num_search_results_to_keep
35
+ self.filter_str = filter_str
36
+ self.processes = processes
37
+
38
+ def can_index(self, url, user_agent_name):
39
+ rp = urllib.robotparser.RobotFileParser()
40
+ robots_url = f"{url.scheme}://{url.netloc}/robots.txt"
41
+
42
+ headers = {
43
+ "User-Agent": user_agent_name,
44
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
45
+ "Accept-Language": "en-US,en;q=0.5",
46
+ "DNT": "1",
47
+ "Connection": "keep-alive",
48
+ "Upgrade-Insecure-Requests": "1",
49
+ }
50
+
51
+ try:
52
+ req = urllib.request.Request(robots_url, headers=headers)
53
+ with urllib.request.urlopen(req) as response:
54
+ rp.parse(response.read().decode("utf-8").splitlines())
55
+
56
+ ok_to_index = rp.can_fetch(user_agent_name, url.geturl())
57
+ except urllib.error.URLError:
58
+ # If there is no robots.txt or there is an error accessing it, assume it's okay to index
59
+ ok_to_index = True
60
+ except Exception as e:
61
+ print(f"An unexpected error occurred in step42: {e}")
62
+ # going the safe route
63
+ ok_to_index = False
64
+ return ok_to_index
65
+
66
+ def fetch_search_results_to_gather_evidence(
67
+ self,
68
+ queryset: dict,
69
+ ):
70
+ user_agent = UserAgent()
71
+ config = Config()
72
+ config.fetch_images = False
73
+
74
+ user_agent_name = "FiltirBot/1.0 (+https://filtir.com/filtirbot-info)"
75
+
76
+ headers = {
77
+ "User-Agent": user_agent_name,
78
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
79
+ "Accept-Language": "en-US,en;q=0.5",
80
+ "DNT": "1",
81
+ "Connection": "keep-alive",
82
+ "Upgrade-Insecure-Requests": "1",
83
+ }
84
+
85
+ # we assume some sites won't permit indexing, so we'll skip these
86
+ num_results = self.num_search_results_to_keep + 5
87
+ results = {}
88
+
89
+ print(f"Found {len(queryset)} claims to fetch search results for")
90
+
91
+ for queryset_idx, item in enumerate(queryset):
92
+ with BlockTimer(
93
+ f"Fetching search results from Google {queryset_idx + 1}/{len(queryset)}"
94
+ ):
95
+ search_results = get_google_search_results(
96
+ query_str=item["claim"], num_results=num_results
97
+ )
98
+
99
+ if search_results == [{"Result": "No good Google Search Result was found"}]:
100
+ item["search_results"] = []
101
+ continue
102
+
103
+ parsed_results = []
104
+ for search_result in search_results:
105
+ if not self.can_index(
106
+ urllib.parse.urlparse(search_result["link"]),
107
+ user_agent_name=user_agent_name,
108
+ ):
109
+ print(
110
+ f"Skipping {search_result['link']} because it doesn't permit indexing"
111
+ )
112
+ continue
113
+ try:
114
+ config.browser_user_agent = user_agent.random
115
+ article = Article(
116
+ search_result["link"], language="en", config=config
117
+ )
118
+ article.download()
119
+ article.parse()
120
+ text = article.text
121
+ except Exception as e:
122
+ print(f"Error parsing article: {e}, trying with requests.get...")
123
+ try:
124
+ response = requests.get(
125
+ search_result["link"], timeout=15, headers=headers
126
+ )
127
+ html = response.text
128
+ soup = BeautifulSoup(html, features="html.parser")
129
+ text = soup.get_text()
130
+ except Exception as exception:
131
+ print(f"Error parsing article: {exception}, skipping")
132
+ continue
133
+
134
+ search_result["text"] = text
135
+ parsed_results.append(search_result)
136
+ if len(parsed_results) == self.num_search_results_to_keep:
137
+ break
138
+ item["search_results"] = parsed_results
139
+
140
+ # update the queryset with new information
141
+ date_str = datetime.now().strftime("%Y-%m-%d")
142
+ results = {"documents": queryset, "dates": {"search_results_fetched": date_str}}
143
+
144
+ print(f"Returning web pages for search results for {len(queryset)} queries")
145
+ return results
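The Google fetcher follows the same pattern; a sketch, assuming the keys consumed by `utils.get_google_search_results` (`GOOGLE_CLOUD_API_KEY`, `GOOGLE_CUSTOM_SEARCH_ENGINE_ID`) are configured:

```python
# Sketch only: assumes the Google Custom Search keys used by utils.py are set.
from step42_api_fetch_google_search_evidence import GoogleEvidence

queryset = [{"claim": "The Origins of Totalitarianism was published in 1951."}]
fetcher = GoogleEvidence(num_search_results_to_keep=2)
google_results = fetcher.fetch_search_results_to_gather_evidence(queryset)
for doc in google_results["documents"]:
    print(doc["claim"], "->", len(doc["search_results"]), "pages kept")
```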
step5_api_embed_search_results.py ADDED
@@ -0,0 +1,290 @@
1
+ import faiss
2
+ import shutil
3
+ from beartype import beartype
4
+ import numpy as np
5
+ import json
6
+ import argparse
7
+ from zsvision.zs_utils import BlockTimer
8
+ import tiktoken
9
+ from pathlib import Path
10
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from pipeline_paths import PIPELINE_PATHS
12
+ from llm_api_utils import (
13
+ init_openai_with_api_key,
14
+ EMBEDDING_DIMENSIONS,
15
+ PRICE_PER_1K_TOKENS,
16
+ )
17
+ from langchain_community.vectorstores import FAISS
18
+ from langchain_openai import OpenAIEmbeddings
19
+ from langchain.docstore.in_memory import InMemoryDocstore
20
+
21
+
22
+ class EmbedResults:
23
+ def __init__(
24
+ self,
25
+ embedding_model="ada",
26
+ limit=0,
27
+ refresh=False,
28
+ refresh_faiss_db=False,
29
+ text_embedding_chunk_size=500,
30
+ filter_str="",
31
+ ):
32
+ self.embedding_model = embedding_model
33
+ self.limit = limit
34
+ self.refresh = refresh
35
+ self.refresh_faiss_db = refresh_faiss_db
36
+ self.text_embedding_chunk_size = text_embedding_chunk_size
37
+ self.filter_str = filter_str
38
+
39
+ @beartype
40
+ def compute_embeddings_from_chunks(
41
+ self, embedding_function: OpenAIEmbeddings, metadatas: list, faiss_db
42
+ ):
43
+ doc_chunks = []
44
+ metadatas_without_chunks = []
45
+ for metadata in metadatas:
46
+ doc_chunk = metadata.pop("doc_chunk")
47
+ doc_chunks.append(doc_chunk)
48
+ metadatas_without_chunks.append(metadata)
49
+
50
+ with BlockTimer(f"Embedding {len(metadatas)} fragments"):
51
+ embeddings = embedding_function.embed_documents(doc_chunks)
52
+ # account for name mangling in Python
53
+ faiss_db._FAISS__add(doc_chunks, embeddings, metadatas_without_chunks)
54
+
55
+ return faiss_db
56
+
57
+ @beartype
58
+ def parse_date_of_fetching(self, data: dict) -> str:
59
+ evidence_keys = {
60
+ "search_results_fetched",
61
+ "results_fetched_from_wikipedia_1M_with_cohere-22-12",
62
+ }
63
+ for key in evidence_keys:
64
+ if key in data["dates"]:
65
+ evidence_fetched_date = data["dates"][key]
66
+ return evidence_fetched_date
67
+ raise ValueError(f"Could not find evidence fetched date in {data['dates']}")
68
+
69
+ def embed_for_uuid(self, srcs):
70
+ init_openai_with_api_key()
71
+
72
+ embedding_function = OpenAIEmbeddings()
73
+
74
+ index = faiss.IndexFlatL2(EMBEDDING_DIMENSIONS[self.embedding_model])
75
+ docstore = InMemoryDocstore({})
76
+ index_to_docstore_id = {}
77
+ faiss_db = FAISS(
78
+ embedding_function=embedding_function.embed_query,
79
+ index=index,
80
+ docstore=docstore,
81
+ index_to_docstore_id=index_to_docstore_id,
82
+ )
83
+
84
+ already_embedded_chunks = {
85
+ doc.metadata["chunk_tag"] for doc in faiss_db.docstore._dict.values()
86
+ }
87
+
88
+ splitter = RecursiveCharacterTextSplitter(
89
+ chunk_size=self.text_embedding_chunk_size,
90
+ chunk_overlap=0,
91
+ )
92
+
93
+ kwarg_list = []
94
+ seen_links = set()
95
+ metadatas = []
96
+ total_chunks = 0
97
+ chunks_to_embed = 0
98
+ chunks_to_skip = 0
99
+
100
+ for data in srcs:
101
+ evidence_fetched_date = self.parse_date_of_fetching(data)
102
+
103
+ for document in data["documents"]:
104
+ for search_result in document["search_results"]:
105
+ # Don't embed the same link twice
106
+ if search_result["link"] in seen_links:
107
+ continue
108
+ seen_links.add(search_result["link"])
109
+
110
+ doc_chunks = [
111
+ doc.page_content
112
+ for doc in splitter.create_documents([search_result["text"]])
113
+ ]
114
+ chunk_tags = [
115
+ f"{search_result['link']}-chunk-{idx}-chunk_sz-{self.text_embedding_chunk_size}"
116
+ for idx in range(len(doc_chunks))
117
+ ]
118
+ for doc_chunk, chunk_tag in zip(doc_chunks, chunk_tags):
119
+ if chunk_tag not in already_embedded_chunks:
120
+ metadatas.append(
121
+ {
122
+ "doc_chunk": doc_chunk,
123
+ "link": search_result["link"],
124
+ "chunk_tag": chunk_tag,
125
+ "date_accessed": evidence_fetched_date,
126
+ "query": document["claim"],
127
+ }
128
+ )
129
+ chunks_to_embed += 1
130
+ else:
131
+ chunks_to_skip += 1
132
+ total_chunks += len(doc_chunks)
133
+
134
+ encoding = tiktoken.encoding_for_model(self.embedding_model)
135
+ doc_chunks = [x["doc_chunk"] for x in metadatas]
136
+ num_words = len(" ".join(doc_chunks).split())
137
+ num_tokens = len(encoding.encode("".join(doc_chunks)))
138
+
139
+ print(
140
+ f"Created {total_chunks} chunks of text to answer from {len(seen_links)} websites"
141
+ )
142
+ print(
143
+ f"Embedding {chunks_to_embed} (skipping {chunks_to_skip}) chunks of text from {len(seen_links)} websites)"
144
+ )
145
+ print(
146
+ f"Embedding {num_tokens} tokens ({num_words} words) from {len(doc_chunks)} chunks"
147
+ )
148
+ print(
149
+ f"Step5: Estimated cost: {num_tokens * PRICE_PER_1K_TOKENS[self.embedding_model]['embed'] / 1000:.2f} USD"
150
+ )
151
+
152
+ if metadatas:
153
+ self.compute_embeddings_from_chunks(
154
+ embedding_function=embedding_function,
155
+ faiss_db=faiss_db,
156
+ metadatas=metadatas,
157
+ )
158
+
159
+ return faiss_db
160
+ return None
161
+
162
+ def embed(self):
163
+ init_openai_with_api_key()
164
+ src_paths = []
165
+ for evidence_key in (
166
+ "google_search_results_evidence",
167
+ "cohere_wikipedia_evidence",
168
+ ):
169
+ evidence_paths = list(PIPELINE_PATHS[evidence_key].glob("**/*.json"))
170
+ src_paths.extend(evidence_paths)
171
+
172
+ if self.filter_str:
173
+ num_paths = len(src_paths)
174
+ src_paths = [
175
+ src_path for src_path in src_paths if self.filter_str in src_path.name
176
+ ]
177
+ print(
178
+ f"Filtering for {self.filter_str} (from {num_paths} to {len(src_paths)})"
179
+ )
180
+
181
+ print(f"Found {len(src_paths)} collections of evidence")
182
+ src_paths = sorted(src_paths)
183
+
184
+ embedding_function = OpenAIEmbeddings()
185
+ faiss_persist_dir = (
186
+ PIPELINE_PATHS["faiss_db_embeddings_for_evidence"]
187
+ / f"{self.embedding_model}_chunk_size_{self.text_embedding_chunk_size}"
188
+ )
189
+
190
+ if faiss_persist_dir.exists():
191
+ if self.refresh_faiss_db:
192
+ print(f"Deleting existing database at {faiss_persist_dir}")
193
+ shutil.rmtree(faiss_persist_dir)
194
+
195
+ # check which chunks we've already embedded to avoid duplication
196
+ if faiss_persist_dir.exists() and not self.refresh_faiss_db:
197
+ faiss_db = FAISS.load_local(
198
+ folder_path=str(faiss_persist_dir), embeddings=embedding_function
199
+ )
200
+ print(f"Found existing database at {faiss_persist_dir}, using... ")
201
+ else:
202
+ index = faiss.IndexFlatL2(EMBEDDING_DIMENSIONS[self.embedding_model])
203
+ docstore = InMemoryDocstore({})
204
+ index_to_docstore_id = {}
205
+ faiss_db = FAISS(
206
+ embedding_function=embedding_function.embed_query,
207
+ index=index,
208
+ docstore=docstore,
209
+ index_to_docstore_id=index_to_docstore_id,
210
+ )
211
+ print(f"Persisting intialised database to {faiss_persist_dir}")
212
+ faiss_db.save_local(folder_path=str(faiss_persist_dir))
213
+
214
+ already_embedded_chunks = {
215
+ doc.metadata["chunk_tag"] for doc in faiss_db.docstore._dict.values()
216
+ }
217
+
218
+ splitter = RecursiveCharacterTextSplitter(
219
+ chunk_size=self.text_embedding_chunk_size,
220
+ chunk_overlap=0,
221
+ )
222
+
223
+ kwarg_list = []
224
+ seen_links = set()
225
+ metadatas = []
226
+ total_chunks = 0
227
+ chunks_to_embed = 0
228
+ chunks_to_skip = 0
229
+
230
+ for src_path in src_paths:
231
+ with open(src_path, "r") as f:
232
+ data = json.load(f)
233
+
234
+ evidence_fetched_date = self.parse_date_of_fetching(data)
235
+
236
+ for document in data["documents"]:
237
+ for search_result in document["search_results"]:
238
+ # Don't embed the same link twice
239
+ if search_result["link"] in seen_links:
240
+ continue
241
+ seen_links.add(search_result["link"])
242
+
243
+ doc_chunks = [
244
+ doc.page_content
245
+ for doc in splitter.create_documents([search_result["text"]])
246
+ ]
247
+ chunk_tags = [
248
+ f"{search_result['link']}-chunk-{idx}-chunk_sz-{self.text_embedding_chunk_size}"
249
+ for idx in range(len(doc_chunks))
250
+ ]
251
+ for doc_chunk, chunk_tag in zip(doc_chunks, chunk_tags):
252
+ if chunk_tag not in already_embedded_chunks:
253
+ metadatas.append(
254
+ {
255
+ "doc_chunk": doc_chunk,
256
+ "link": search_result["link"],
257
+ "chunk_tag": chunk_tag,
258
+ "date_accessed": evidence_fetched_date,
259
+ "query": document["claim"],
260
+ }
261
+ )
262
+ chunks_to_embed += 1
263
+ else:
264
+ chunks_to_skip += 1
265
+ total_chunks += len(doc_chunks)
266
+
267
+ encoding = tiktoken.encoding_for_model(self.embedding_model)
268
+ doc_chunks = [x["doc_chunk"] for x in metadatas]
269
+ num_words = len(" ".join(doc_chunks).split())
270
+ num_tokens = len(encoding.encode("".join(doc_chunks)))
271
+
272
+ print(
273
+ f"Created {total_chunks} chunks of text to answer from {len(seen_links)} websites"
274
+ )
275
+ print(
276
+ f"Embedding {chunks_to_embed} (skipping {chunks_to_skip}) chunks of text from {len(seen_links)} websites)"
277
+ )
278
+ print(
279
+ f"Embedding {num_tokens} tokens ({num_words} words) from {len(doc_chunks)} chunks"
280
+ )
281
+ print(
282
+ f"Estimated cost: {num_tokens * PRICE_PER_1K_TOKENS[self.embedding_model]['embed'] / 1000:.2f} USD"
283
+ )
284
+
285
+ if metadatas:
286
+ self.compute_embeddings_from_chunks(
287
+ embedding_function=embedding_function,
288
+ faiss_db=faiss_db,
289
+ metadatas=metadatas,
290
+ )
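The evidence gathered by the two fetchers above can then be pushed into an in-memory FAISS index via `embed_for_uuid`; a sketch, assuming `OPENAI_API_KEY` is set and `cohere_results` / `google_results` are the dicts returned by those fetchers:

```python
# Sketch only: assumes OPENAI_API_KEY is set; cohere_results / google_results
# are the outputs of the two evidence fetchers above.
from step5_api_embed_search_results import EmbedResults

embedder = EmbedResults(embedding_model="ada", text_embedding_chunk_size=500)
faiss_db = embedder.embed_for_uuid([cohere_results, google_results])
if faiss_db is not None:
    print(f"Indexed {len(faiss_db.index_to_docstore_id)} chunks")
```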
step6_api_claims_to_evidence.py ADDED
@@ -0,0 +1,85 @@
1
+ import faiss
2
+ import shutil
3
+ from beartype import beartype
4
+ import numpy as np
5
+ import json
6
+ import argparse
7
+ from zsvision.zs_utils import BlockTimer
8
+ import tiktoken
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ import multiprocessing as mp
11
+ from zsvision.zs_multiproc import starmap_with_kwargs
12
+ from llm_api_utils import init_openai_with_api_key, PRICE_PER_1K_TOKENS
13
+ import multiprocessing as mp
14
+ from zsvision.zs_multiproc import starmap_with_kwargs
15
+ from langchain_community.vectorstores import FAISS
16
+ from langchain_openai import OpenAIEmbeddings
17
+ from langchain.docstore.in_memory import InMemoryDocstore
18
+
19
+
20
+ class ClaimToEvidence:
21
+ def __init__(
22
+ self,
23
+ embedding_model="ada",
24
+ limit=0,
25
+ refresh=False,
26
+ processes=1,
27
+ num_chunks_per_worker=50,
28
+ filter_str="",
29
+ text_embedding_chunk_size=500,
30
+ k_nearest_neighbours=3,
31
+ ):
32
+ self.embedding_model = embedding_model
33
+ self.limit = limit
34
+ self.refresh = refresh
35
+ self.processes = processes
36
+ self.num_chunks_per_worker = num_chunks_per_worker
37
+ self.filter_str = filter_str
38
+ self.text_embedding_chunk_size = text_embedding_chunk_size
39
+ self.k_nearest_neighbours = k_nearest_neighbours
40
+
41
+ @beartype
42
+ def link_claims_to_evidence(
43
+ self,
44
+ metas,
45
+ faiss_db,
46
+ ):
47
+ embedding_function = OpenAIEmbeddings()
48
+
49
+ # build a query from the claim and source fragment
50
+ queries = [
51
+ f"Evidence for {x['claim']} (Based on {x['verbatim_quote']})" for x in metas
52
+ ]
53
+ encoding = tiktoken.encoding_for_model(self.embedding_model)
54
+
55
+ num_tokens = len(encoding.encode(" ".join(queries)))
56
+ print(
57
+ f"Step6: Estimated cost: {num_tokens * PRICE_PER_1K_TOKENS[self.embedding_model]['embed'] / 1000:.2f} USD"
58
+ )
59
+ k_nearest_neighbours = min(
60
+ len(faiss_db.index_to_docstore_id), self.k_nearest_neighbours
61
+ )
62
+
63
+ for text_query, meta in zip(queries, metas):
64
+ docs_and_scores = faiss_db.similarity_search_with_relevance_scores(
65
+ text_query, k=k_nearest_neighbours
66
+ )
67
+
68
+ # allow evidence to be serialised
69
+ evidences = []
70
+ for document, score in docs_and_scores:
71
+ evidence = {
72
+ "chunk_tag": document.metadata["chunk_tag"],
73
+ "link": document.metadata["link"],
74
+ "query": document.metadata["query"],
75
+ "date_accessed": document.metadata["date_accessed"],
76
+ "text": document.page_content,
77
+ "similarity_score": float(score),
78
+ }
79
+ evidences.append(evidence)
80
+
81
+ meta["evidences"] = evidences
82
+ meta["embedded_query_used_to_find_evidence"] = text_query
83
+
84
+ print(f"Returning {len(metas)} queries with supporting evidence")
85
+ return metas
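Claims are then matched to their nearest evidence chunks; a sketch, assuming `faiss_db` is the index built in step 5, `OPENAI_API_KEY` is set, and each claim dict carries both `claim` and `verbatim_quote` keys:

```python
# Sketch only: assumes faiss_db comes from step 5 and OPENAI_API_KEY is set.
from step6_api_claims_to_evidence import ClaimToEvidence

claims = [
    {
        "claim": "Hannah Arendt was born in 1906.",
        "verbatim_quote": "Hannah Arendt was born in 1906",
    }
]
linker = ClaimToEvidence(k_nearest_neighbours=3)
claims_with_evidence = linker.link_claims_to_evidence(claims, faiss_db)
print(claims_with_evidence[0]["evidences"][0]["link"])
```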
step7_api_check_claims_against_evidence.py ADDED
@@ -0,0 +1,153 @@
1
+ import argparse
2
+ import json
3
+ import multiprocessing as mp
4
+ from zsvision.zs_multiproc import starmap_with_kwargs
5
+ from pathlib import Path
6
+ from zsvision.zs_utils import BlockTimer
7
+ from typing import List, Dict
8
+ from llm_api_utils import (
9
+ call_openai_with_exponetial_backoff,
10
+ estimate_cost_of_text_generation_api_call,
11
+ init_openai_with_api_key,
12
+ )
13
+
14
+
15
+ class CheckClaimAgainstEvidence:
16
+ def __init__(
17
+ self,
18
+ temperature=0.0,
19
+ max_num_evidences=2,
20
+ model="gpt-3.5-turbo",
21
+ src_dir=Path("data/raw"),
22
+ dest_dir=Path("data/extracted_claims"),
23
+ filter_str="",
24
+ processes=1,
25
+ refresh=False,
26
+ ):
27
+ self.temperature = temperature
28
+ self.max_num_evidences = max_num_evidences
29
+ self.model = model
30
+ self.src_dir = src_dir
31
+ self.dest_dir = dest_dir
32
+ self.filter_str = filter_str
33
+ self.processes = processes
34
+ self.refresh = refresh
35
+
36
+ def check_claim_against_evidence(
37
+ self,
38
+ claim: str,
39
+ evidences: List[Dict[str, str]],
40
+ ):
41
+ init_openai_with_api_key()
42
+ evidence_str = ""
43
+ for evidence in evidences:
44
+ # avoid linebreaks in each piece of evidence, else it can create a confusing prompt
45
+ text_evidence = evidence["text"].replace("\n", " ")
46
+ evidence_str += f"{text_evidence}\n"
47
+ evidence_str += f"URL: {evidence['link']}'\n"
48
+ evidence_str += f"Date accessed: {evidence['date_accessed']}\n\n"
49
+
50
+ prompt = f"""\
51
+ Your task is to assess whether a claim is correct based on the given pieces of evidence.
52
+
53
+ Your answer should be in json format as follows:
54
+ {{
55
+ "verdict": "<verdict>",
56
+ "justification": "<justification for the verdict>",
57
+ "quotes": ["<most relevant verbatim quotes from evidence>"],
58
+ "URLs": "<URL sources for verbatim quotes>",
59
+ "date_accessed": "<access dates for URL quotes>"
60
+ }}
61
+ The <verdict> label should be one of the following:
62
+ "Fully supported", "Partially supported", "Unsupported"
63
+
64
+ When quoting the relevant sentence from the evidence, be careful to copy it **EXACTLY** (with no edits).
65
+ ---
66
+ ## Example
67
+
68
+ **Claim**:
69
+ Hannah Arendt was born in 1906.
70
+
71
+ **Pieces of evidence**:
72
+ Hannah Arendt was a 20th-century German-Jewish political thinker and philosopher. She was born in Linden, Hanover, Germany in 1906. When she was three her family moved to Königsberg so that her father’s syphilis could be treated. He died when she was seven years old. Königsberg was where Immanuel Kant was born, right?
73
+
74
+ Königsberg was where Immanuel Kant was born, right?
75
+ URL: https://fivebooks.com/best-books/hannah-arendt-samantha-rose-hill/'
76
+ Date accessed: 2023-05-10
77
+
78
+ Hannah Arendt was born as Johanna Arendt in 1906, in the Wilhelmine period. Her German Jewish family were comfortable, educated and secular in Linden, Prussia (now a part of Hanover). They were merchants of Russian extraction from Königsberg.[a] Her grandparents were members of the Reform Jewish community. Her paternal grandfather, Max Arendt [de] (1843–1913), was a prominent businessman, local politician, a leader of the Königsberg Jewish community and a member of the Centralverein deutscher
79
+ URL: https://en.wikipedia.org/wiki/Hannah_Arendt'
80
+ Date accessed: 2023-05-10
81
+
82
+
83
+ **Assessment**:
84
+ {{
85
+ "verdict": "Fully supported",
86
+ "justification": "The claim about Hannah Arendt's birth date is directly supported by the evidence."
87
+ "quote": "Hannah Arendt was born as Johanna Arendt in 1906, in the Wilhelmine period.",
88
+ "URL": "https://en.wikipedia.org/wiki/Hannah_Arendt",
89
+ "date_accessed": "2023-05-10"
90
+ }}
91
+ ---
92
+ **Claim**:
93
+ {claim}
94
+
95
+ **Pieces of evidence**:
96
+ {evidence_str}
97
+ **Assessment**:\
98
+ """
99
+ persona = "You are a careful research assistant who helps with fact-checking and editing informative articles."
100
+ system_message = {"role": "system", "content": persona}
101
+ user_message = {"role": "user", "content": prompt}
102
+ messages = [system_message, user_message]
103
+
104
+ with BlockTimer(
105
+ f"Using OpenAI API to check claims against evidence {self.model}"
106
+ ):
107
+ response = call_openai_with_exponetial_backoff(
108
+ model=self.model,
109
+ temperature=self.temperature,
110
+ messages=messages,
111
+ response_format={"type": "json_object"},
112
+ )
113
+
114
+ cost = estimate_cost_of_text_generation_api_call(
115
+ model=self.model, response=response, verbose=True
116
+ )
117
+
118
+ assessment = response.choices[0].message.content
119
+ assessment_dict = json.loads(assessment)
120
+ return {"assessment": assessment_dict, "cost": cost}
121
+
122
+ def check_claims_against_evidence(self, claims_with_evidence):
123
+ """
124
+ Checks claims against evidence.
125
+ """
126
+ kwarg_list = []
127
+ results = []
128
+ for idx, item in enumerate(claims_with_evidence):
129
+ kwarg_list.append(
130
+ {
131
+ "claim": item["claim"],
132
+ "evidences": item["evidences"][: self.max_num_evidences],
133
+ }
134
+ )
135
+ if self.processes == 1:
136
+ for kwargs in kwarg_list:
137
+ results.append(self.check_claim_against_evidence(**kwargs))
138
+ else: # multiprocess
139
+ func = self.check_claim_against_evidence
140
+ with mp.Pool(processes=self.processes) as pool:
141
+ results = starmap_with_kwargs(
142
+ pool=pool, func=func, kwargs_iter=kwarg_list
143
+ )
144
+ costs = [result["cost"] for result in results]
145
+ print(f"Total cost: {sum(costs)} USD")
146
+ assessed_claims = []
147
+ for result, item in zip(results, claims_with_evidence):
148
+ item["assessment"] = result["assessment"]
149
+ item["verdict_model"] = self.model
150
+ assessed_claims.append(item)
151
+
152
+ print(f"Writing {len(assessed_claims)} assessed claims")
153
+ return assessed_claims
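The verdict step consumes the linked evidence; a sketch, assuming `claims_with_evidence` is the step 6 output and `OPENAI_API_KEY` is set (the JSON response format requires a model that supports it, e.g. `gpt-4-1106-preview`):

```python
# Sketch only: assumes claims_with_evidence comes from step 6 and OPENAI_API_KEY is set.
from step7_api_check_claims_against_evidence import CheckClaimAgainstEvidence

checker = CheckClaimAgainstEvidence(model="gpt-4-1106-preview", processes=1)
assessed = checker.check_claims_against_evidence(claims_with_evidence)
for item in assessed:
    print(item["claim"], "->", item["assessment"]["verdict"])
```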
step8_api_format_fact_checked_document.py ADDED
@@ -0,0 +1,121 @@
1
+ import argparse
2
+ import re
3
+ from collections import defaultdict
4
+ import json
5
+ from text_utils import find_matching_indices
6
+ from pathlib import Path
7
+
8
+
9
+ class FormatDocument:
10
+ def __init__(
11
+ self,
12
+ footnote_style: str,
13
+ temperature=0.0,
14
+ model="gpt-4",
15
+ dest_dir=Path("data/extracted_claims"),
16
+ filter_str="",
17
+ refresh=False,
18
+ ):
19
+ self.temperature = temperature
20
+ self.model = model
21
+ self.dest_dir = dest_dir
22
+ self.filter_str = filter_str
23
+ self.refresh = refresh
24
+ self.footnote_style = footnote_style
25
+
26
+ def cleanup_explanation(self, claim_assessment: dict, mode: str) -> str:
27
+ claim = claim_assessment["claim"]
28
+ assessment = claim_assessment["assessment"]
29
+ justification = assessment["justification"]
30
+ category = assessment["verdict"]
31
+ urls = assessment["URLs"]
32
+ date_accessed = assessment["date_accessed"]
33
+
34
+ prefixes = {
35
+ "Fully supported": "✅",
36
+ "Partially supported": "❓",
37
+ "Unsupported": "❗",
38
+ }
39
+ prefix = prefixes[category]
40
+ quotes = ",".join(f'"{quote}"' for quote in assessment["quotes"])
41
+ # Sometimes, the verdict justification contains newlines , which messes up the formatting of footnotes.
42
+ justification = justification.replace("\n", "")
43
+
44
+ if mode == "terse":
45
+ footnote = f"Claim: {claim} 👉 {category} {urls}"
46
+ elif mode == "verbose":
47
+ footnote = f"Claim: {claim} 👉 {category} {quotes} {justification}, URLs: {urls}, date accessed: {date_accessed}"
48
+ footnote = f"{prefix} {footnote}"
49
+ return footnote
50
+
51
+ def reformat_document_to_include_claims(
52
+ self,
53
+ original_text,
54
+ fact_verdicts,
55
+ footnote_style=None,
56
+ ):
57
+ bibliography = []
58
+ footnote_markers_to_insert = []
59
+ statistics = defaultdict(int)
60
+ number_of_facts_checked = 0
61
+ if footnote_style:
62
+ self.footnote_style = footnote_style
63
+ for fact_idx, claim_assessment in enumerate(fact_verdicts):
64
+ if self.footnote_style == "terse":
65
+ footnote_str = f"{fact_idx + 1}"
66
+ elif self.footnote_style == "verbose":
67
+ footnote_str = claim_assessment["claim"].replace(" ", "-")
68
+ # footnote markers cannot contain much punctuation or commas in Jekyll
69
+ # (even though this is valid in GitHub-flavoured markdown)
70
+ for char in [
71
+ ",",
72
+ ".",
73
+ '"',
74
+ "'",
75
+ ":",
76
+ ";",
77
+ "(",
78
+ ")",
79
+ "[",
80
+ "]",
81
+ "{",
82
+ "}",
83
+ "*",
84
+ ]:
85
+ footnote_str = footnote_str.replace(char, "")
86
+
87
+ explanation = self.cleanup_explanation(
88
+ claim_assessment, mode=self.footnote_style
89
+ )
90
+ footnote_marker = f"[^{footnote_str}]"
91
+ query = claim_assessment["verbatim_quote"]
92
+
93
+ assert (
94
+ original_text.count(query) == 1
95
+ ), f"Found {original_text.count(query)} matches for {query}, rather than 1"
96
+ start_pos = original_text.find(query)
97
+ assert start_pos != -1, f"Could not find {query} in {original_text}"
98
+ end_pos = start_pos + len(query)
99
+ footnote_markers_to_insert.append((end_pos, footnote_marker))
100
+ verdict_category = claim_assessment["assessment"]["verdict"]
101
+ statistics[verdict_category] += 1
102
+ number_of_facts_checked += 1
103
+ bibliography.append(f"{footnote_marker}: {explanation} ")
104
+
105
+ # perform insertions in reverse order so that the indices don't get messed up
106
+ modified_text = original_text
107
+ for char_pos, footnote_marker in sorted(
108
+ footnote_markers_to_insert, reverse=True
109
+ ):
110
+ modified_text = (
111
+ modified_text[:char_pos] + footnote_marker + modified_text[char_pos:]
112
+ )
113
+
114
+ modified_text += "\n\n"
115
+ modified_text += "\n".join(bibliography)
116
+
117
+ # assert number_of_facts_checked != 0, "No facts were checked"
118
+ if number_of_facts_checked == 0:
119
+ print("No objective facts were found.")
120
+ modified_text = "No clear-cut objective claims were detected."
121
+ return modified_text
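Finally, the verdicts are folded back into the source text as footnotes; a sketch, assuming `assessed` is the step 7 output, `original_text` is the passage being checked, and each `verbatim_quote` occurs exactly once in it:

```python
# Sketch only: assumes `assessed` comes from step 7 and each verbatim_quote
# appears exactly once in original_text.
from step8_api_format_fact_checked_document import FormatDocument

formatter = FormatDocument(footnote_style="verbose")
fact_checked_md = formatter.reformat_document_to_include_claims(original_text, assessed)
print(fact_checked_md)
```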
subjective_claims.txt ADDED
@@ -0,0 +1 @@
1
+ Arendt's experiences during this time influenced her work on totalitarianism and human rights.
text_utils.py ADDED
@@ -0,0 +1,188 @@
1
+ import re
2
+ from typing import Dict
3
+ import unittest
4
+
5
+
6
+ def parse_passage_quote_and_claim(passage_quote_and_claim: str) -> Dict[str, str]:
7
+ """Parse the quote and claim from a string, where the string is of the form:
8
+
9
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] <passage quote for claim> [CLAIM] <claim>
10
+ """
11
+
12
+ if not passage_quote_and_claim.startswith("[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM]"):
13
+ raise ValueError(f"Invalid input format: {passage_quote_and_claim}")
14
+
15
+ parts = passage_quote_and_claim.split("[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM]")
16
+ source_parts = parts[1].split("[CLAIM]")
17
+
18
+ # If there aren't exactly two parts after splitting by [CLAIM], the format is invalid
19
+ if len(source_parts) != 2:
20
+ raise ValueError(f"Invalid input format: {passage_quote_and_claim}")
21
+
22
+ passage_quote_for_claim = source_parts[0].strip()
23
+ claim = source_parts[1].strip()
24
+ return {"verbatim_quote": passage_quote_for_claim, "claim": claim}
25
+
26
+
27
+ def is_unique_verbatim_quote(verbatim_quote: str, original_passage: str):
28
+ """Check if the verbatim quote is an exact quote from the original passage."""
29
+ return original_passage.count(verbatim_quote) == 1
30
+
31
+
32
+ def find_matching_indices(query: str, original_text: str):
33
+ # Function to remove markdown links and create an index map
34
+ def remove_links(text):
35
+ index_map = []
36
+ result = []
37
+ markdown_links = re.finditer(r"\[([^\]]+)\]\([^)]+\)", text)
38
+
39
+ prev_end = 0
40
+ for match in markdown_links:
41
+ result.append(text[prev_end : match.start()])
42
+ index_map.extend(range(prev_end, match.start()))
43
+ result.append(match.group(1))
44
+ index_map.extend(range(match.start(1), match.end(1)))
45
+ prev_end = match.end()
46
+
47
+ result.append(text[prev_end:])
48
+ index_map.extend(range(prev_end, len(text)))
49
+
50
+ return "".join(result), index_map
51
+
52
+ # Remove markdown links from the original text and create an index map
53
+ cleaned_text, index_map = remove_links(original_text)
54
+
55
+ # Remove markdown links from the query
56
+ cleaned_query, _ = remove_links(query)
57
+
58
+ # Find the start index of the cleaned query in the cleaned text
59
+ start = cleaned_text.find(cleaned_query)
60
+
61
+ # If the query is not found, return an empty list
62
+ if start == -1:
63
+ return []
64
+
65
+ # Add the query length to get the end index
66
+ end = start + len(cleaned_query)
67
+
68
+ # Use the index map to find the corresponding start and end indices in the original text
69
+ original_start = index_map[start]
70
+ original_end = index_map[end - 1] + 1
71
+
72
+ return [(original_start, original_end)]
73
+
74
+
75
+ class TestCases(unittest.TestCase):
76
+ def test_find_matching_indices(self):
77
+ """Test the find_matching_indices() function.
78
+ This function should return a list of matches, where each match is a tuple of (start, end) indices.
79
+
80
+ The start and end indices should be the character positions of the query in the original_text, accounting
81
+ for the fact that markdown links should be ignored when performing the match.
82
+
83
+ """
84
+ test_cases = [
85
+ {
86
+ "query": "Her writings on these subjects continue to resonate with present-day political struggles, as well as with the works of other philosophers like Immanuel Kant and Edmund Husserl.",
87
+ "original": "Arendt's later works, sought to further unravel the complexities of power and rebellion. Her writings on these subjects continue to resonate with present-day political struggles, as well as with the works of other philosophers like [Immanuel Kant](/philosophy/2023-immanuel-kant-life-and-work) and [Edmund Husserl](/philosophy/2023-edmund-husserl-his-life-and-work).\n\n## A Lasting Legacy",
88
+ "expected": "Her writings on these subjects continue to resonate with present-day political struggles, as well as with the works of other philosophers like [Immanuel Kant](/philosophy/2023-immanuel-kant-life-and-work) and [Edmund Husserl](/philosophy/2023-edmund-husserl-his-life-and-work).",
89
+ },
90
+ {
91
+ "query": "I went to the sea side (at the weekend).",
92
+ "original": "I woke up. Then I went to the sea side (at the weekend). Then I went home.",
93
+ "expected": "I went to the sea side (at the weekend).",
94
+ },
95
+ {
96
+ "query": "no merger with the [solar farm] company",
97
+ "original": "There would be no merger with the [solar farm] company.",
98
+ "expected": "no merger with the [solar farm] company",
99
+ },
100
+ {
101
+ "query": "with the works of other philosophers like [Immanuel Kant](/philosophy/2023-immanuel-kant-life-and-work) and [Edmund Husserl](/philosophy/2023-edmund-husserl-his-life-and-work)",
102
+ "original": "\n\n## Fleeing Germany and the Road to Academia\n\nWith the rise of the Nazi regime in the 1930s, Arendt's Jewish heritage put her in grave danger. She fled Germany in 1933 and settled in Paris, where she became involved with a number of political and social organizations advocating for Jewish refugees. In 1940, she was interned in a French camp as an enemy alien, but managed to escape and eventually make her way to the United States in 1941.\n\nArendt's experiences during this time would deeply influence her work on totalitarianism and human rights. In New York, she began to immerse herself in academic life, working as an editor, journalist, and lecturer. Her first major work, *The Origins of Totalitarianism*, published in 1951, explored the common roots of Nazism and Stalinism, and established her as a significant voice in political philosophy.\n\n## A Life Of Controversial, Influential Works\n\nThroughout her career, Arendt wrote a number of seminal, and controversial, works. *The Human Condition* (1958) examined the role of politics in modern societies and introduced the concept of \"the public realm\" – the space where individuals act and participate in political life. This exploration of freedom and action would become a recurring theme in her writings.\n\nHer 1963 publication, *Eichmann in Jerusalem: A Report on the Banality of Evil*, based on her coverage of Adolf Eichmann's trial, ignited significant controversy. Arendt argued that Eichmann, a key architect of the Holocaust, was not a monster but rather an ordinary bureaucrat who unquestioningly followed orders. The idea of the \"banality of evil\" continues to influence discussions on the nature of evil and moral responsibility.\n\nArendt's later works, such as *On Revolution* (1963) and *Between Past and Future* (1968), sought to further unravel the complexities of power, authority, and rebellion. Her writings on these subjects continue to resonate with present-day political struggles, as well as with the works of other philosophers like [Immanuel Kant](/philosophy/2023-immanuel-kant-life-and-work) and [Edmund Husserl](/philosophy/2023-edmund-husserl-his-life-and-work).\n\n## A Lasting Legacy\n\nHannah Arendt died in 1975, but her work remains as relevant as ever.",
103
+ "expected": "with the works of other philosophers like [Immanuel Kant](/philosophy/2023-immanuel-kant-life-and-work) and [Edmund Husserl](/philosophy/2023-edmund-husserl-his-life-and-work)",
104
+ },
105
+ ]
106
+
107
+ for test_case in test_cases:
108
+ matches = find_matching_indices(
109
+ query=test_case["query"], original_text=test_case["original"]
110
+ )
111
+ assert (
112
+ len(matches) == 1
113
+ ), f"Expected exactly one match, but found {len(matches)}"
114
+ result = test_case["original"][matches[0][0] : matches[0][1]]
115
+ msg = (
116
+ f"Expected\n\n{test_case['expected']}\n\nbut instead found\n\n{result}"
117
+ )
118
+ self.assertEqual(result, test_case["expected"], msg)
119
+ print(f"Passed all tests for find_matching_indices()")
120
+
121
+ def test_parse_passage_quote_and_claim(self):
122
+ """Test the following function:
123
+ parse_passage_quote_and_claim(passage_quote_and_claim: str) -> {"verbatim_quote": str, "claim": str}
124
+
125
+ The passage quote and claim should take the form:
126
+ [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] <passage quote for claim> [CLAIM] <claim>
127
+ """
128
+ test_cases = [
129
+ {
130
+ "passage_quote_and_claim": "[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Hannah Arendt [was born in] 1906 in Linden, Germany [CLAIM] Hannah Arendt was born in Linden, Germany.",
131
+ "expected": {
132
+ "verbatim_quote": "Hannah Arendt [was born in] 1906 in Linden, Germany",
133
+ "claim": "Hannah Arendt was born in Linden, Germany.",
134
+ },
135
+ },
136
+ {
137
+ "passage_quote_and_claim": "Something [VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Hannah Arendt [was born in] 1906 in Linden, Germany [CLAIM] Hannah Arendt was born in Linden, Germany.",
138
+ "expected": "Exception",
139
+ },
140
+ ]
141
+ for test_case in test_cases:
142
+ expected = test_case["expected"]
143
+ if expected == "Exception":
144
+ self.assertRaises(
145
+ ValueError,
146
+ parse_passage_quote_and_claim,
147
+ test_case["passage_quote_and_claim"],
148
+ )
149
+ else:
150
+ parsed = parse_passage_quote_and_claim(
151
+ passage_quote_and_claim=test_case["passage_quote_and_claim"]
152
+ )
153
+ self.assertEqual(parsed["verbatim_quote"], expected["verbatim_quote"])
154
+
155
+ def test_is_unique_verbatim_quote_check(self):
156
+ """Test the following function:
157
+ is_unique_verbatim_quote_check(verbatim_quote: str) -> bool
158
+
159
+ This function should return True if the verbatim quote is indeed a quote and is unique, and false otherwise.
160
+
161
+ """
162
+ test_cases = [
163
+ {
164
+ "verbatim_quote": "Hannah Arendt [was born in] 1906 in Linden, Germany",
165
+ "original_passage": "Hannah Arendt [was born in] 1906 in Linden, Germany at a time when...",
166
+ "expected": True,
167
+ },
168
+ {
169
+ "verbatim_quote": "Hannah Arendt [was born in] 1906 in Linden, Germany",
170
+ "original_passage": "Hannah Arendt [wasn't born in] 1906 in Linden, Germany at a time when...",
171
+ "expected": False,
172
+ },
173
+ {
174
+ "verbatim_quote": "Hannah Arendt [was born in] 1906 in Linden, Germany. Hannah Arendt was a person.",
175
+ "original_passage": "Hannah Arendt",
176
+ "expected": False,
177
+ },
178
+ ]
179
+ for test_case in test_cases:
180
+ result = is_unique_verbatim_quote(
181
+ verbatim_quote=test_case["verbatim_quote"],
182
+ original_passage=test_case["original_passage"],
183
+ )
184
+ self.assertEqual(result, test_case["expected"])
185
+
186
+
187
+ if __name__ == "__main__":
188
+ unittest.main()
utils.py ADDED
@@ -0,0 +1,18 @@
1
+ import os
2
+ from functools import lru_cache
3
+ from langchain_community.utilities import GoogleSearchAPIWrapper
4
+
5
+ @lru_cache(maxsize=2)
6
+ def get_search_wrapper():
7
+ os.environ["GOOGLE_API_KEY"] = os.environ.get("GOOGLE_CLOUD_API_KEY")
8
+
9
+ os.environ["GOOGLE_CSE_ID"] = os.environ.get("GOOGLE_CUSTOM_SEARCH_ENGINE_ID")
10
+ return GoogleSearchAPIWrapper()
11
+
12
+
13
+ def get_google_search_results(query_str: str, num_results: int):
14
+ google_search_tool = get_search_wrapper()
15
+ search_results = google_search_tool.results(
16
+ query=query_str, num_results=num_results
17
+ )
18
+ return search_results
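A quick sketch of the search helper on its own, assuming `GOOGLE_CLOUD_API_KEY` and `GOOGLE_CUSTOM_SEARCH_ENGINE_ID` are set (e.g. via `.env`):

```python
# Sketch only: assumes GOOGLE_CLOUD_API_KEY and GOOGLE_CUSTOM_SEARCH_ENGINE_ID are set.
from utils import get_google_search_results

hits = get_google_search_results(query_str="Hannah Arendt biography", num_results=3)
for hit in hits:
    print(hit.get("title"), hit.get("link"))
```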