import argparse
import json
import multiprocessing as mp
from zsvision.zs_multiproc import starmap_with_kwargs
from zsvision.zs_utils import BlockTimer
from text_utils import is_unique_verbatim_quote, parse_passage_quote_and_claim
from llm_api_utils import (
    call_openai_with_exponetial_backoff,
    estimate_cost_of_text_generation_api_call,
    init_openai_with_api_key,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter


class ClaimExtractor:
    """Extract discrete factual claims from free text with an OpenAI chat model.

    The input text is split into passages, each passage is sent to the model
    together with a worked example, and every line of the reply is parsed into
    a (verbatim quote, claim) record.
    """

    # Markers that delimit the two fields on each line of the model's reply.
    _QUOTE_MARKER = "[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM]"
    _CLAIM_MARKER = "[CLAIM]"

    def __init__(
        self,
        temperature=0,
        model="gpt-3.5-turbo",
        filter_str="",
        processes=1,
        refresh=False,
    ):
        """Initializes ClaimExtractor with the provided arguments.

        Args:
            temperature: Sampling temperature forwarded to the chat model.
            model: Name of the chat model used for extraction.
            filter_str: Stored on the instance; not read by the methods
                defined in this class body.
            processes: Number of worker processes. A value of exactly 1 runs
                the passages sequentially in the current process.
            refresh: Stored on the instance; not read by the methods defined
                in this class body.
        """
        self.temperature = temperature
        self.model = model
        self.filter_str = filter_str
        self.processes = processes
        self.refresh = refresh

    @staticmethod
    def _build_prompt(passage: str) -> str:
        """Return the few-shot extraction prompt for a single passage."""
        # NOTE: the instructions describe a `NAME: value` format while the
        # worked example uses `[NAME] value`; `_parse_claims` accepts both.
        # Every example quote below must be an exact substring of the example
        # passage, otherwise the example contradicts the instructions.
        return f"""\
Task:
Enumerate all the discrete factual claims or logical assertions stated in the passage that follows the dashed horizontal line below. \
To allow the claims to be linked to the passage, use the format: `VERBATIM_PASSAGE_QUOTE_FOR_CLAIM: <verbatim passage quote for claim>, CLAIM: <claim>` on each line. \
The <verbatim passage quote for claim> must be A SINGLE UNEDITED SUBSTRING from the passage that uniquely identifies the claim. \
The <verbatim passage quote for claim> must carefully preserve all punctuation and clauses from the original passage. \
This text will be used in the final national exam.

----------
Here is an example passage, together with the verbatim passage quotes and claims that should be extracted from it:

Passage:
Immanuel Kant was born in 1724 into a modest, devoutly religious family, with his father working as a saddle-maker. \
He was one of nine children, but only five, including Kant, survived to adulthood. \
His upbringing was steeped in the Pietist tradition, emphasizing intense religious devotion, a literal interpretation of the Bible, and a strong focus on personal morality. \
Kant attended the University of Königsberg, studying various subjects, including theology, metaphysics, and natural science. \
After completing his studies, Kant worked as a private tutor for nine years before returning to the University of Königsberg as a lecturer in 1755. \
In his works Groundwork of the Metaphysics of Morals (1785) and Critique of Practical Reason (1788), Kant argues that morality is not contingent upon personal desires or cultural norms. \

Extracted source phrases and claims:
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Immanuel Kant was born in 1724 into a modest, devoutly religious family [CLAIM] Immanuel Kant was born in 1724.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Immanuel Kant was born in 1724 into a modest, devoutly religious family [CLAIM] Immanuel Kant was born into a modest family.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Immanuel Kant was born in 1724 into a modest, devoutly religious family [CLAIM] Immanuel Kant was born into a devoutly religious family.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] with his father working as a saddle-maker [CLAIM] Immanuel Kant's father worked as a saddle-maker.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] He was one of nine children [CLAIM] Immanuel Kant was one of nine children.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] but only five, including Kant, survived to adulthood [CLAIM] Only five of Immanuel Kant's parents' children survived to adulthood.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] His upbringing was steeped in the Pietist tradition [CLAIM] Immanuel Kant's upbringing was steeped in the Pietist tradition.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] emphasizing intense religious devotion [CLAIM] Immanuel Kant's upbringing emphasized intense religious devotion.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] a literal interpretation of the Bible [CLAIM] Immanuel Kant's upbringing emphasized a literal interpretation of the Bible.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] a strong focus on personal morality [CLAIM] Immanuel Kant's upbringing emphasized a strong focus on personal morality.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] Kant attended the University of Königsberg [CLAIM] Immanuel Kant attended the University of Königsberg.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] studying various subjects, including theology, metaphysics, and natural science [CLAIM] Immanuel Kant studied theology.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] studying various subjects, including theology, metaphysics, and natural science [CLAIM] Immanuel Kant studied metaphysics.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] studying various subjects, including theology, metaphysics, and natural science [CLAIM] Immanuel Kant studied natural science.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] After completing his studies [CLAIM] Immanuel Kant completed his studies.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] After completing his studies, Kant worked as a private tutor for nine years [CLAIM] After completing his studies, Immanuel Kant worked as a private tutor.
[VERBATIM_PASSAGE_QUOTE_FOR_CLAIM] before returning to the University of Königsberg as a lecturer in 1755 [CLAIM] Immanuel Kant returned to the University of Königsberg as a lecturer in 1755.
----------
Passage:
{passage}

Extracted source phrases and claims:\
"""

    def _parse_claims(self, content: str, passage: str) -> list:
        """Turn the raw model reply into a list of parsed claim records.

        Each reply line is normalised to the bracketed marker format, lines
        without a quote marker are dropped, and the remainder are handed to
        `parse_passage_quote_and_claim`. The result of
        `is_unique_verbatim_quote` for the parsed quote is stored on each
        record under the "is_unique_and_verbatim" key.
        """
        parsed_claims = []
        for raw_line in content.strip().split("\n"):
            line = raw_line.strip()
            # The model sometimes answers in the `NAME: value` format given in
            # the instructions rather than the bracketed format of the
            # example, so rewrite that variant into bracketed markers.
            if self._QUOTE_MARKER not in line:
                line = line.replace(
                    "VERBATIM_PASSAGE_QUOTE_FOR_CLAIM: ",
                    f"{self._QUOTE_MARKER}: ",
                )
            if self._CLAIM_MARKER not in line:
                line = line.replace(" CLAIM:", f" {self._CLAIM_MARKER}:")
            # Anything still lacking a quote marker is chatter, not a claim.
            if self._QUOTE_MARKER not in line:
                continue
            parsed = parse_passage_quote_and_claim(line)
            parsed["is_unique_and_verbatim"] = is_unique_verbatim_quote(
                verbatim_quote=parsed["verbatim_quote"],
                original_passage=passage,
            )
            parsed_claims.append(parsed)
        return parsed_claims

    def extract_claims_from_passage(
        self,
        idx: int,
        total: int,
        passage: str,
    ) -> dict:
        """Extract the claims made in one passage.

        Args:
            idx: Zero-based position of the passage (used for progress output).
            total: Total number of passages (used for progress output).
            passage: The passage text to analyse.

        Returns:
            A dict with keys "claims" (list of parsed claim records) and
            "cost" (the value returned by
            `estimate_cost_of_text_generation_api_call` for this request).
        """
        # Called here rather than in __init__ because this method is also the
        # entry point executed inside pool worker processes.
        init_openai_with_api_key()
        print(f"Processing passage {idx + 1} of {total}")

        persona = "You are a careful research assistant who helps with fact-checking and editing informative articles."
        messages = [
            {"role": "system", "content": persona},
            {"role": "user", "content": self._build_prompt(passage)},
        ]

        with BlockTimer(f"Using OpenAI API to extract claims with {self.model}"):
            response = call_openai_with_exponetial_backoff(
                model=self.model,
                temperature=self.temperature,
                messages=messages,
            )

        cost = estimate_cost_of_text_generation_api_call(
            model=self.model, response=response, verbose=True
        )

        content = response.choices[0].message.content
        return {"claims": self._parse_claims(content, passage), "cost": cost}

    def extract_claims(self, text_input: str) -> list:
        """Extract claims from `text_input` and return them as a list.

        The text is split into chunks of at most 1000 characters with no
        overlap; each chunk is processed by `extract_claims_from_passage`,
        sequentially when `self.processes == 1` and in a process pool
        otherwise.

        Args:
            text_input: The full text to analyse.

        Returns:
            The parsed claim records of all passages, in passage order for
            the sequential path (pool ordering depends on
            `starmap_with_kwargs` — presumably preserved; verify).
        """
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        docs = text_splitter.create_documents([text_input])
        total = len(docs)
        print(f"Split text into {total} documents")

        kwarg_list = [
            {
                "idx": idx,
                "total": total,
                # remove newlines from the passage to avoid a confusing prompt format
                "passage": doc.page_content.replace("\n", " "),
            }
            for idx, doc in enumerate(docs)
        ]

        if self.processes == 1:
            results = [
                self.extract_claims_from_passage(**kwargs) for kwargs in kwarg_list
            ]
        else:  # multiprocess
            with mp.Pool(processes=self.processes) as pool:
                results = starmap_with_kwargs(
                    pool=pool,
                    func=self.extract_claims_from_passage,
                    kwargs_iter=kwarg_list,
                )

        cost = sum(result["cost"] for result in results)
        all_claims = []
        for result in results:
            all_claims.extend(result["claims"])

        print(f"Returning {len(all_claims)} claims (cost: {cost} USD)")
        return all_claims