import argparse import json import multiprocessing as mp from zsvision.zs_multiproc import starmap_with_kwargs from zsvision.zs_utils import BlockTimer from text_utils import is_unique_verbatim_quote, parse_passage_quote_and_claim from llm_api_utils import ( call_openai_with_exponetial_backoff, estimate_cost_of_text_generation_api_call, init_openai_with_api_key, ) class FixAnchors: def __init__( self, temperature=0, model="gpt-3.5-turbo", filter_str="", processes=8, refresh=False, ): self.temperature = temperature self.model = model self.filter_str = filter_str self.processes = processes self.refresh = refresh def fix_passage_anchor( self, idx: int, total: int, original_passage: str, claim_with_metadata: dict, ): init_openai_with_api_key() print(f"Processing claim with metadata {idx + 1} of {total}") # we remove newlines original_passage = original_passage.replace("\n", " ") assert not claim_with_metadata[ "is_unique_and_verbatim" ], "We should only fix broken passage anchors" prompt = f"""\ Task: A co-worker was tasked with identifying a unique, verbatim quote from a passage that underpins a particular claim. \ Unfortunately, they made a mistake and the quote they identified is not unique and verbatim. \ Your task is to fix their quote so that it is both verbatim and unique. ----- Here is an example passage, together with the claim and the erroneous quote. Passage: In 1940, she was interned in a French camp as an enemy alien, but managed to escape and eventually make her way to the United States in 1941. \ Arendt's experiences during this time would deeply influence her work on totalitarianism and human rights. \ In New York, she began to immerse herself in academic life, working as an editor, journalist, and lecturer. \ Her first major work, *The Origins of Totalitarianism*, published in 1951, explored the common roots of Nazism and Stalinism, and established her as a significant voice in political philosophy. \ ## A Life Of Controversial, Influential Works \ Throughout her career, Arendt wrote a number of seminal, and controversial, works. *The Human Condition* (1958) examined the role of politics in modern societies and introduced the concept of "the public realm" – the space where individuals act and participate in political life. \ This exploration of freedom and action would become a recurring theme in her writings. \ Her 1963 publication, *Eichmann in Jerusalem: A Report on the Banality of Evil*, based on her coverage of Adolf Eichmann's trial, ignited significant controversy. \ Arendt argued that Eichmann, a key architect of the Holocaust, was not a monster but rather an ordinary bureaucrat who unquestioningly followed orders. \ The idea of the "banality of evil" continues to influence discussions on the nature of evil and moral responsibility. \ Arendt's later works, such as *On Revolution* (1963) and *Between Past and Future* (1968), sought to further unravel the complexities of power, authority, and rebellion. \ Her writings on these subjects continue to resonate with present-day political struggles, as well as with the works of other philosophers like [Immanuel Kant](/philosophy/2023-immanuel-kant-life-and-work) and [Edmund Husserl](/philosophy/2023-edmund-husserl-his-life-and-work). \ Claim: *The Origins of Totalitarianism* established Arendt as a significant voice in political philosophy. Initial attempt at a unique and verbatim quote: [The Origins of Totalitarianism] established her as a significant voice in political philosophy. Correct (unique and verbatim) quote: Her first major work, *The Origins of Totalitarianism*, published in 1951, explored the common roots of Nazism and Stalinism, and established her as a significant voice in political philosophy. ----- Passage: {original_passage} Claim: {claim_with_metadata["claim"]} Initial attempt at a unique verbatim quote: {claim_with_metadata["verbatim_quote"]} Correct (unique and verbatim) quote:\ """ persona = "You are a careful research assistant who helps with fact-checking and editing informative articles." system_message = {"role": "system", "content": persona} user_message = {"role": "user", "content": prompt} messages = [system_message, user_message] with BlockTimer(f"Using OpenAI API to extract claims with {self.model}"): response = call_openai_with_exponetial_backoff( model=self.model, temperature=self.temperature, messages=messages, ) cost = estimate_cost_of_text_generation_api_call( model=self.model, response=response, verbose=True ) content = response.choices[0].message.content verbatim_quote = content.rstrip() is_unique_and_verbatim = is_unique_verbatim_quote( verbatim_quote=verbatim_quote, original_passage=original_passage ) assert ( is_unique_and_verbatim ), f"Failed to fix passage anchor: {claim_with_metadata['verbatim_quote']} was updated to {verbatim_quote} but is not unique and verbatim" claim_with_metadata["verbatim_quote"] = verbatim_quote return {"claim_with_metadata": claim_with_metadata, "cost": cost} def fix_passage_anchors(self, claims_with_metadata, original_passage: str): kwarg_list = [] valid_claims_with_metadata = [] invalid_claims_with_metadata = [] for idx, claim_with_metadata in enumerate(claims_with_metadata): # remove newlines from the passage to avoid a confusing prompt format if not claim_with_metadata["is_unique_and_verbatim"]: invalid_claims_with_metadata.append(claim_with_metadata) else: valid_claims_with_metadata.append(claim_with_metadata) for idx, claim_with_metadata in enumerate(invalid_claims_with_metadata): kwarg_list.append( { "idx": idx, "total": len(invalid_claims_with_metadata), "claim_with_metadata": claim_with_metadata, "original_passage": original_passage, } ) if self.processes == 1: results = [] for kwargs in kwarg_list: try: results.append(self.fix_passage_anchor(**kwargs)) except Exception as e: print(f"Exception in step2: {e}, model: {self.model}") print("Skipping this claim!") if self.model == "gpt-4": pass else: raise e else: # multiprocess func = self.fix_passage_anchor with mp.Pool(processes=self.processes) as pool: results = starmap_with_kwargs( pool=pool, func=func, kwargs_iter=kwarg_list ) cost = sum([result["cost"] for result in results]) for result in results: valid_claims_with_metadata.append(result["claim_with_metadata"]) # remove the is_unique_and_verbatim field (no longer needed) for claim_with_metadata in valid_claims_with_metadata: del claim_with_metadata["is_unique_and_verbatim"] print( f"Returning {len(valid_claims_with_metadata)} claims with metadat (cost: {cost} USD)" ) return valid_claims_with_metadata