import json
import argparse
import multiprocessing as mp
import random
from typing import List, Dict

import numpy as np
from zsvision.zs_multiproc import starmap_with_kwargs
from zsvision.zs_utils import BlockTimer

from llm_api_utils import (
    call_openai_with_exponetial_backoff,
    estimate_cost_of_text_generation_api_call,
    init_openai_with_api_key,
)


class ClassifyClaims:
    def __init__(
        self,
        temperature=0,
        model="gpt-3.5-turbo",
        max_claims_per_api_call=10,
        processes=8,
        filter_str="",
        refresh=False,
    ):
        self.temperature = temperature
        self.model = model
        self.max_claims_per_api_call = max_claims_per_api_call
        self.processes = processes
        self.filter_str = filter_str
        self.refresh = refresh
        self.objective_claims_file = "objective_claims.txt"
        self.subjective_claims_file = "subjective_claims.txt"

    def parse_classification_label(self, text: str) -> str:
        """Parse a classified claim of the form "<claim> [label]" into its label."""
        raw = text.strip()
        if raw.endswith("[objective]"):
            label = "objective"
        elif raw.endswith("[subjective]"):
            label = "subjective"
        else:
            raise ValueError(f"Invalid label: {raw}")
        return label

    def read_file(self, file_name):
        """Read a text file and return its lines, stripped of surrounding whitespace."""
        with open(file_name, "r") as f:
            return [line.strip() for line in f]

    def create_few_shot_learning_prompt(self) -> str:
        """Build a few-shot prompt from the labelled example claims: first the
        bare claims, then the same claims suffixed with their labels."""
        objective_list = self.read_file(self.objective_claims_file)
        subjective_list = self.read_file(self.subjective_claims_file)
        merged_list = list(
            zip(objective_list, ["[objective]"] * len(objective_list))
        ) + list(zip(subjective_list, ["[subjective]"] * len(subjective_list)))

        # Shuffle with a fixed seed so the examples are interleaved but the
        # prompt remains reproducible across runs
        seed = 1234
        random.seed(seed)
        random.shuffle(merged_list)

        prompt = "Claims:\n"
        for claim, _ in merged_list:
            prompt += claim + "\n"

        prompt += "\nClassifications:\n"
        for claim, classif in merged_list:
            prompt += claim + " " + classif + "\n"
        return prompt

    def classify_claim_batch(
        self,
        idx: int,
        total: int,
        claims_and_sources_batch: List[Dict[str, str]],
    ):
        print(
            f"Processing batch {idx + 1} of {total} "
            f"(containing {len(claims_and_sources_batch)} claims)"
        )
        claim_str = "\n".join([claim["claim"] for claim in claims_and_sources_batch])
        num_batch_claims = len(claims_and_sources_batch)
        few_shot = self.create_few_shot_learning_prompt()
        prompt = f"""\
Objective claims can be verified based on factual data (such as those that could be verified by \
referencing an encyclopedia), whereas subjective claims involve a personal interpretation of \
the data and are more open to debate. \
For each of the following claims given below the dashed horizontal line, classify them as \
[subjective] or [objective] by suffixing the claim with the appropriate label.
OUTPUT ONLY the class, either subjective or objective for each claim!

Here are some examples:
{few_shot}
----------
Claims:
{claim_str}

Classifications:\
"""
        persona = (
            "You are a careful research assistant who helps with fact-checking "
            "and editing informative articles."
        )
        system_message = {"role": "system", "content": persona}
        user_message = {"role": "user", "content": prompt}
        messages = [system_message, user_message]

        with BlockTimer(f"Using OpenAI API to classify claims with {self.model}"):
            response = call_openai_with_exponetial_backoff(
                model=self.model,
                temperature=self.temperature,
                messages=messages,
            )

        cost = estimate_cost_of_text_generation_api_call(
            model=self.model, response=response, verbose=True
        )

        content = response.choices[0].message.content
        batch_classified_claims = content.split("\n")
        assert (
            len(batch_classified_claims) == num_batch_claims
        ), f"Expected {num_batch_claims} claims, but got {len(batch_classified_claims)}"
        print(
            f"Generated {len(batch_classified_claims)} classifications "
            f"(cost: {cost:.4f} USD)"
        )

        claims_with_labels = []
        for claim_and_source, classified_claim in zip(
            claims_and_sources_batch, batch_classified_claims
        ):
            claim_label = self.parse_classification_label(classified_claim)
            claim_and_source["label"] = claim_label
            claims_with_labels.append(claim_and_source)
        return {"claims_with_labels": claims_with_labels, "cost": cost}

    def classify_claims(self, claims_and_sources):
        """Classify claims as being either subjective or objective, and return
        them with their labels attached."""
        init_openai_with_api_key()
        num_claims = len(claims_and_sources)

        # we limit the number of claims per API call (otherwise GPT-4 can choke)
        num_batches = int(np.ceil(num_claims / self.max_claims_per_api_call))
        claims_and_sources_batches = [
            batch.tolist() for batch in np.array_split(claims_and_sources, num_batches)
        ]
        kwarg_list = []
        for idx, claims_and_sources_batch in enumerate(claims_and_sources_batches):
            kwarg_list.append(
                {
                    "idx": idx,
                    "total": len(claims_and_sources_batches),
                    "claims_and_sources_batch": claims_and_sources_batch,
                }
            )

        if self.processes == 1:
            batch_results = []
            for kwargs in kwarg_list:
                batch_results.append(self.classify_claim_batch(**kwargs))
        else:  # multiprocess
            func = self.classify_claim_batch
            with mp.Pool(processes=self.processes) as pool:
                batch_results = starmap_with_kwargs(
                    pool=pool, func=func, kwargs_iter=kwarg_list
                )

        cost = sum(result["cost"] for result in batch_results)
        labelled_claims = []
        for batch in batch_results:
            labelled_claims.extend(batch["claims_with_labels"])
        print(f"Returning {len(labelled_claims)} claims (cost: {cost:.4f} USD)")
        return labelled_claims

    def filter_to_objective_claims(self, claims):
        """Filter claims to only those labelled as objective."""
        objective_claims = [claim for claim in claims if claim["label"] == "objective"]
        print(f"Returning {len(objective_claims)} objective claims")
        return objective_claims
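

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). It assumes a
# JSON file containing a list of {"claim": ..., "source": ...} dicts; the
# "claims.json" default and the CLI flag names below are hypothetical. The
# __main__ guard matters here because classify_claims spawns worker processes
# via multiprocessing when processes > 1.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--claims_path", default="claims.json")
    parser.add_argument("--processes", type=int, default=8)
    args = parser.parse_args()

    with open(args.claims_path, "r") as f:
        claims_and_sources = json.load(f)

    classifier = ClassifyClaims(processes=args.processes)
    labelled_claims = classifier.classify_claims(claims_and_sources)
    objective_claims = classifier.filter_to_objective_claims(labelled_claims)
    print(json.dumps(objective_claims, indent=2))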