import csv
import time

import pandas as pd

from chatgpt_detector_roberta import (
    check_human,
    detect_ai_content,
)
from search_text import detect_by_relative_search

HUMAN = "HUMAN"
MACHINE = "MACHINE"


def read_csv_column(file_path, column_name, data_size=100):
    """Read a CSV file and return up to *data_size* values from one column.

    Args:
        file_path: Path to the CSV file.
        column_name: Name of the column to extract data from.
        data_size: Maximum number of rows to return (default 100).

    Returns:
        A list with at most ``data_size`` items from the column, or an
        empty list when the file or the column is missing.
    """
    try:
        df = pd.read_csv(file_path)
        return df[column_name].tolist()[:data_size]
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return []
    except KeyError:
        print(f"Error: Column '{column_name}' not found in the CSV file.")
        return []


def evaluation(texts, start_index=83, output_path="eva_bbc_test.csv"):
    """Classify each text with the SOTA model and a search-engine check.

    Each result row is appended to *output_path* immediately, so an
    interrupted run can be resumed by raising *start_index*.

    Args:
        texts: Iterable of document strings to classify.
        start_index: Skip texts whose position is below this value.
            The default (83) preserves the resume point that was
            previously hard-coded as ``index <= 82``.
        output_path: CSV file that per-text results are appended to.

    Returns:
        A list of ``(index, SOTA_prediction, SOTA_confidence,
        search_engine_prediction)`` tuples for the processed texts.
    """
    results = []
    for index, text in enumerate(texts):
        if index < start_index:
            print(f"index = {index}")
            continue

        # Classify with the SOTA (RoBERTa-based) detector.
        SOTA_prediction, SOTA_confidence = detect_ai_content(text)

        # Classify via search engine: look for near-duplicate sources.
        is_paraphrased, _, data = detect_by_relative_search(text)
        if not is_paraphrased:
            search_engine_prediction = "UNKNOWN"
        elif check_human(data):
            search_engine_prediction = HUMAN
        else:
            search_engine_prediction = MACHINE

        print(
            f"RESULTS:\t{SOTA_prediction}\t{search_engine_prediction}"
        )
        row = (index, SOTA_prediction, SOTA_confidence, search_engine_prediction)
        results.append(row)

        # Append immediately so progress survives a crash or manual stop.
        with open(output_path, "a", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(list(row))

        # Throttle to stay under the search engine's per-minute query limit.
        time.sleep(1)

    return results


def extract_machine_data(file_path):
    """Extract machine-generated rows from *file_path* into machine_data.csv.

    NOTE(review): the "gpt-3.5-trubo" spelling is kept as-is — it must
    match the label used in the source dataset; confirm before changing.
    """
    df = pd.read_csv(file_path)
    machine_data = df[df["src"] == "xsum_machine_topical_gpt-3.5-trubo"]
    machine_data.to_csv("machine_data.csv", index=False)


def extract_human_data(file_path):
    """Extract human-written rows from *file_path* into human_data.csv."""
    df = pd.read_csv(file_path)
    human_data = df[df["src"] == "xsum_human"]
    # Bug fix: this previously wrote to machine_data.csv, clobbering the
    # output of extract_machine_data().
    human_data.to_csv("human_data.csv", index=False)


if __name__ == "__main__":
    # extract_machine_data('data/test_data/test.csv')

    # BBC
    file_path = "data/test_data/test_100_bbc.csv"
    column_name = "content"

    # MAGE
    # file_path = "data/test_data/test_100_MAGE.csv"
    # column_name = "text"

    contents = read_csv_column(
        file_path=file_path,
        column_name=column_name,
        data_size=100,
    )
    evaluation(contents)