import argparse
import os
import string
from collections import Counter

import pandas as pd
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Ensure the required NLTK resources are available the first time this runs.
import nltk
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')


def preprocess_document(doc):
    """
    Tokenizes, removes punctuation and stopwords, and stems the words in a single document.
    """
    # Lowercase
    doc = doc.lower()
    # Remove punctuation
    doc = doc.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = word_tokenize(doc)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return stemmed_tokens


@st.cache_data
def find_dividing_words(documents):
    """
    Identifies candidate words that might split the set of documents into two groups.
    """
    all_words = []
    per_doc_word_counts = []

    for i, doc in enumerate(documents):
        print(i)
        preprocessed_doc = preprocess_document(doc)
        all_words.extend(preprocessed_doc)
        per_doc_word_counts.append(Counter(preprocessed_doc))

    # Overall word frequency
    overall_word_counts = Counter(all_words)

    # Find words that appear in roughly half the documents
    num_docs = len(documents)
    candidate_words = []
    for word, count in overall_word_counts.items():
        doc_frequency = sum(1 for doc_count in per_doc_word_counts if doc_count[word] > 0)
        if 0.35 * num_docs <= doc_frequency <= 0.75 * num_docs:
            candidate_words.append(word)

    print("Done with dividing words")
    return candidate_words


def make_contents(doc):
    """
    Returns the contents of a document as a single string.
    """
    if "title" in doc and "contents" in doc:
        return doc["title"] + " " + doc["contents"]
    if "headline" in doc and "text" in doc:
        return doc["headline"] + " " + doc["text"]
    if "title" in doc and "text" in doc:
        return doc["title"] + " " + doc["text"]
    if "contents" in doc:
        return doc["contents"]
    if "text" in doc:
        return doc["text"]
    return ""


def main(args):
    # Read the qrels and docs (`.relevant_only` files) from the `args.dataset` directory.
    base_dir = os.path.join("data", args.dataset)
    qrels = pd.read_csv(os.path.join(base_dir, "qrels.relevant_only.trec"), sep="\t",
                        header=None, names=["qid", "docid", "rel"])
    docs = pd.read_json(os.path.join(base_dir, "docs.relevant_only.jsonl"), lines=True)

    for qid in qrels.groupby("qid").groups.keys():
        # Get the relevant documents for the current query
        relevant_docids = qrels[qrels["qid"] == qid]["docid"].tolist()
        # Get the text for the relevant documents
        relevant_docs_text = docs[docs["doc_id"].isin(relevant_docids)].apply(
            lambda x: make_contents(x), axis=1).tolist()
        splitting_words = find_dividing_words(relevant_docs_text)

        breakpoint()  # Drop into the debugger to inspect `splitting_words` for this query.


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Find words that might split the set of documents into two groups.')
    parser.add_argument('dataset', type=str, help='The dataset to use (e.g. "robust04")')
    args = parser.parse_args()
    main(args)
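
# Example invocation (a sketch; the script filename "find_dividing_words.py" is an assumption,
# while the data/<dataset>/ layout with qrels.relevant_only.trec and docs.relevant_only.jsonl
# comes from main() above):
#
#   python find_dividing_words.py robust04
#
# For each query in the qrels file, the script prints per-document progress, computes the
# candidate "dividing" words for that query's relevant documents, and then stops at
# breakpoint() so the words can be inspected interactively.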