orionweller committed on
Commit
8bfed60
1 Parent(s): 3c28932

move to avoid import

Files changed (2)
  1. app.py +67 -1
  2. find_splitting_words.py +0 -103
app.py CHANGED
@@ -9,8 +9,74 @@ import re
 import tqdm
 import plotly.express as px

+
+import pandas as pd
+from nltk.corpus import stopwords
+from nltk.stem import PorterStemmer
+from nltk.tokenize import word_tokenize
+from collections import Counter
+import string
+import os
+import streamlit as st
+
+# Ensure you've downloaded the set of stop words the first time you run this
+import nltk
+# only download if they don't exist
+if not os.path.exists(os.path.join(nltk.data.find('corpora'), 'stopwords')):
+    nltk.download('punkt')
+    nltk.download('stopwords')
+
+
 from dataset_loading import load_local_qrels, load_local_corpus, load_local_queries
-from find_splitting_words import find_dividing_words
+
+
+def preprocess_document(doc):
+    """
+    Tokenizes, removes punctuation, stopwords, and stems words in a single document.
+    """
+    # Lowercase
+    doc = doc.lower()
+    # Remove punctuation
+    doc = doc.translate(str.maketrans('', '', string.punctuation))
+    # Tokenize
+    tokens = word_tokenize(doc)
+    # Remove stop words
+    stop_words = set(stopwords.words('english'))
+    filtered_tokens = [word for word in tokens if word not in stop_words]
+    # Stemming
+    stemmer = PorterStemmer()
+    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
+    return stemmed_tokens
+
+@st.cache_data
+def find_dividing_words(documents):
+    """
+    Identifies candidate words that might split the set of documents into two groups.
+    """
+    all_words = []
+    per_doc_word_counts = []
+
+    i = 0
+    for doc in documents:
+        print(i)
+        preprocessed_doc = preprocess_document(doc)
+        all_words.extend(preprocessed_doc)
+        per_doc_word_counts.append(Counter(preprocessed_doc))
+        i += 1
+
+    # Overall word frequency
+    overall_word_counts = Counter(all_words)
+
+    # Find words that appear in roughly half the documents
+    num_docs = len(documents)
+    candidate_words = []
+    for word, count in overall_word_counts.items():
+        doc_frequency = sum(1 for doc_count in per_doc_word_counts if doc_count[word] > 0)
+        if 0.35 * num_docs <= doc_frequency <= 0.75 * num_docs:
+            candidate_words.append(word)
+    print("Done with dividing words")
+
+    return candidate_words


 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
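A minimal smoke test for the helpers inlined above, assuming preprocess_document and find_dividing_words are importable (or pasted into a scratch script) and the NLTK punkt/stopwords data is installed; the sample sentences below are invented for illustration and are not part of the commit:

    # Hypothetical usage sketch, not part of app.py.
    sample_docs = [
        "The referee stopped the match after a late goal.",
        "A late goal forced the referee to stop the match.",
        "Parliament debated the national budget for hours.",
        "The national budget passed after a long parliamentary debate.",
    ]

    print(preprocess_document(sample_docs[0]))  # lowercased, stopword-free, stemmed tokens
    print(find_dividing_words(sample_docs))     # terms in 35-75% of the docs, e.g. stems of "match" and "budget"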
find_splitting_words.py DELETED
@@ -1,103 +0,0 @@
-import argparse
-import pandas as pd
-from nltk.corpus import stopwords
-from nltk.stem import PorterStemmer
-from nltk.tokenize import word_tokenize
-from collections import Counter
-import string
-import os
-import streamlit as st
-
-# Ensure you've downloaded the set of stop words the first time you run this
-import nltk
-# only download if they don't exist
-if not os.path.exists(os.path.join(nltk.data.find('corpora'), 'stopwords')):
-    nltk.download('punkt')
-    nltk.download('stopwords')
-
-def preprocess_document(doc):
-    """
-    Tokenizes, removes punctuation, stopwords, and stems words in a single document.
-    """
-    # Lowercase
-    doc = doc.lower()
-    # Remove punctuation
-    doc = doc.translate(str.maketrans('', '', string.punctuation))
-    # Tokenize
-    tokens = word_tokenize(doc)
-    # Remove stop words
-    stop_words = set(stopwords.words('english'))
-    filtered_tokens = [word for word in tokens if word not in stop_words]
-    # Stemming
-    stemmer = PorterStemmer()
-    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
-    return stemmed_tokens
-
-@st.cache_data
-def find_dividing_words(documents):
-    """
-    Identifies candidate words that might split the set of documents into two groups.
-    """
-    all_words = []
-    per_doc_word_counts = []
-
-    i = 0
-    for doc in documents:
-        print(i)
-        preprocessed_doc = preprocess_document(doc)
-        all_words.extend(preprocessed_doc)
-        per_doc_word_counts.append(Counter(preprocessed_doc))
-        i += 1
-
-    # Overall word frequency
-    overall_word_counts = Counter(all_words)
-
-    # Find words that appear in roughly half the documents
-    num_docs = len(documents)
-    candidate_words = []
-    for word, count in overall_word_counts.items():
-        doc_frequency = sum(1 for doc_count in per_doc_word_counts if doc_count[word] > 0)
-        if 0.35 * num_docs <= doc_frequency <= 0.75 * num_docs:
-            candidate_words.append(word)
-    print("Done with dividing words")
-
-    return candidate_words
-
-
-def make_contents(doc):
-    """
-    Returns the contents of a document as a single string.
-    """
-    if "title" in doc and "contents" in doc:
-        return doc["title"] + " " + doc["contents"]
-    if "headline" in doc and "text" in doc:
-        return doc["headline"] + " " + doc["text"]
-    if "title" in doc and "text" in doc:
-        return doc["title"] + " " + doc["text"]
-    if "contents" in doc:
-        return doc["contents"]
-    if "text" in doc:
-        return doc["text"]
-
-
-def main(args):
-    # read in the qrels and docs file from the `args.dataset` directory for the `.relevant_only` files
-    base_dir = os.path.join("data", args.dataset)
-    qrels = pd.read_csv(os.path.join(base_dir, "qrels.relevant_only.trec"), sep="\t", header=None, names=["qid", "docid", "rel"])
-    docs = pd.read_json(os.path.join(base_dir, "docs.relevant_only.jsonl"), lines=True)
-
-    for qid in qrels.groupby("qid").groups.keys():
-        # get the relevant documents for the current query
-        relevant_docids = qrels[qrels["qid"] == qid]["docid"].tolist()
-        # get the text for the relevant documents
-        relevant_docs_text = docs[docs["doc_id"].isin(relevant_docids)].apply(lambda x: make_contents(x), axis=1).tolist()
-        splitting_words = find_dividing_words(relevant_docs_text)
-
-        breakpoint()
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Find words that might split the set of documents into two groups.')
-    parser.add_argument('dataset', type=str, help='The dataset to use (e.g. "robust04")')
-    args = parser.parse_args()
-    main(args)
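For reference, the deleted module also worked as a standalone script: per its argparse definition it took a single dataset name and read the .relevant_only qrels and docs files from the matching directory under data/, for example:

    # Previous command-line usage of the now-removed script (dataset name taken from its own help text).
    python find_splitting_words.py robust04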