Keane Moraes committed
Commit d87b50e
1 Parent(s): 28e14c5

clustering works

Files changed (5):
  1. .gitignore +2 -1
  2. .vscode/settings.json +24 -0
  3. clustering.py +2 -0
  4. topics.py +14 -22
  5. utils.py +51 -10
.gitignore CHANGED
@@ -1,2 +1,3 @@
 /__pycache__*
-recursive-exclude * *.py[co]
+recursive-exclude * *.py[co]
+/.vscode*
.vscode/settings.json ADDED
@@ -0,0 +1,24 @@
+{
+    "editor.tokenColorCustomizations": {
+        "textMateRules": [
+            {
+                "scope": "googletest.failed",
+                "settings": {
+                    "foreground": "#f00"
+                }
+            },
+            {
+                "scope": "googletest.passed",
+                "settings": {
+                    "foreground": "#0f0"
+                }
+            },
+            {
+                "scope": "googletest.run",
+                "settings": {
+                    "foreground": "#0f0"
+                }
+            }
+        ]
+    }
+}
clustering.py ADDED
@@ -0,0 +1,2 @@
+import spacy
+import pandas as pd
topics.py CHANGED
@@ -1,37 +1,29 @@
 import openai
 from utils import *
-import mdforest
-import pandas as pd
-import spacy
 
-class Insights:
+class TopicModelling:
 
     EMBEDDING_MAX_TOKENS = 1023
 
     def __init__(self, text:str) -> None:
-        cleaned_text = mdforest.clean_markdown(text)
         self.keywords = []
-
-        self.corpus = preprocess(cleaned_text)
-        self.text = create_nest_sentences(self.corpus, self.EMBEDDING_MAX_TOKENS)
+        self.corpus = text
+        # self.text = create_nest_sentences(self.corpus, self.EMBEDDING_MAX_TOKENS)
         self.model = load_keyword_model()
-        self.embedder = load_embedder()
-
 
     def generate_topics(self) -> list:
-        for sentence in self.text:
-            self.keywords = self.keywords + generate_keywords(self.model, sentence)
-        return self.keywords
+
+        keywords = self.model.extract_keywords(self.corpus, keyphrase_ngram_range=(1, 1), stop_words=None)
+        topics = self.model.extract_keywords(self.corpus, keyphrase_ngram_range=(1, 2), stop_words=None)
+        keywords = [kw[0] for kw in keywords] + [kw[0] for kw in topics]
+        concepts = self.model.extract_keywords(self.corpus, keyphrase_ngram_range=(3, 3), stop_words='english', top_n=5)
+        concepts = [kw[0] for kw in concepts]
+
+        return keywords, concepts
+
+
+
 
-    def generate_embeddings(self) -> list:
-        # generate embeddings for all the sentences
-        nlp = spacy.load("en_core_web_sm")
-        final_embeddings = []
-        for text in self.text:
-            print(text[0])
-            doc = nlp(text[0])
-            sentence_embeddings = [sent.vector for sent in doc.sents]
-            final_embeddings += sentence_embeddings
 
 
 
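For context, a minimal sketch of how the reworked TopicModelling class could be driven. This is not part of the commit; the sample string and variable names are illustrative only, and it assumes KeyBERT and its dependencies are installed:

    # hypothetical driver for topics.TopicModelling (illustrative only)
    from topics import TopicModelling

    sample = "Transformers build contextual embeddings. Clustering those embeddings groups related sentences together."
    tm = TopicModelling(sample)
    keywords, concepts = tm.generate_topics()
    print(keywords)   # unigrams and bigrams from KeyBERT.extract_keywords
    print(concepts)   # top 5 trigram keyphrases (stop_words='english')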
 
utils.py CHANGED
@@ -4,12 +4,11 @@ from nltk.corpus import stopwords
 from transformers import AutoTokenizer
 import re
 import spacy
+from sklearn.cluster import KMeans, AgglomerativeClustering
+import numpy as np
 from sentence_transformers import SentenceTransformer
 
-# @st.cache_data
-# def load_nlp():
-#     nlp =
-
+MODEL = 'all-MiniLM-L6-v2'
 
 @st.cache_data
 def load_autotoken():
@@ -18,12 +17,13 @@ def load_autotoken():
 
 @st.cache_data
 def load_keyword_model():
-    kw_model = KeyBERT()
-    return kw_model
+    sentence_model = load_model()
+    kw_model = KeyBERT(model=sentence_model)
+    return kw_model
 
 @st.cache_data
-def load_embedder():
-    embedder = SentenceTransformer('all-MiniLM-L6-v2')
+def load_model():
+    embedder = SentenceTransformer(MODEL)
     return embedder
 
 def create_nest_sentences(document:str, token_max_length = 1023):
@@ -32,7 +32,7 @@ def create_nest_sentences(document:str, token_max_length = 1023):
     length = 0
     tokenizer = load_autotoken()
 
-    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
+    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", '.')):
         tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0] # hugging face transformer tokenizer
         length += len(tokens_in_sentence)
 
@@ -51,7 +51,7 @@ def preprocess(text) -> str:
     stop_words = set(stopwords.words("english"))
     text = text.lower()
     text = ''.join([c for c in text if c not in ('!', '.', ',', '?', ':', ';', '"', "'", '-', '(', ')')])
-    words = text.split()
+    words = text.split()
     words = [w for w in words if not w in stop_words]
     return " ".join(words)
 
@@ -64,3 +64,44 @@ def generate_keywords(kw_model, document: str) -> list:
     for extraction in complex_extractions:
         final_topics.append(extraction[0])
     return final_topics
+
+def cluster_based_on_topics(embedder, text1:str, text2:str, num_clusters:int = 2):
+    nlp = spacy.load("en_core_web_sm")
+
+    # Preprocess and tokenize the texts
+    doc1 = nlp(preprocess(text1))
+    doc2 = nlp(preprocess(text2))
+
+    # Extract sentences from the texts
+    sentences1 = [sent.text for sent in doc1.sents]
+    sentences2 = [sent.text for sent in doc2.sents]
+    all_sentences = sentences1 + sentences2
+
+    with open('insight1_sent.txt', 'w') as f:
+        for item in sentences1:
+            f.write("%s\n" % item)
+
+    with open('insight2_sent.txt', 'w') as f:
+        for item in sentences2:
+            f.write("%s\n" % item)
+
+    # Generate sentence embeddings for each sentence
+    sentence_embeddings1 = embedder.encode(sentences1)
+    sentence_embeddings2 = embedder.encode(sentences2)
+    all_embeddings = np.concatenate((sentence_embeddings1, sentence_embeddings2), axis=0)
+
+    # Normalize the embeddings to unit length
+    all_embeddings = all_embeddings / np.linalg.norm(all_embeddings, axis=1, keepdims=True)
+
+    # Perform agglomerative clustering on the normalized embeddings
+    clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
+    clustering_model.fit(all_embeddings)
+    cluster_assignment = clustering_model.labels_
+
+    clustered_sentences = {}
+    for sentence_id, cluster_id in enumerate(cluster_assignment):
+        if cluster_id not in clustered_sentences:
+            clustered_sentences[cluster_id] = []
+        clustered_sentences[cluster_id].append(all_sentences[sentence_id])
+
+    return clustered_sentences
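Finally, a sketch of how the new utils.py helpers might be wired together. Again this is not part of the commit: the two input strings are placeholders, and note that the function as committed also writes insight1_sent.txt and insight2_sent.txt to the working directory as a side effect:

    # hypothetical driver for utils.load_model and utils.cluster_based_on_topics
    from utils import load_model, cluster_based_on_topics

    embedder = load_model()  # cached SentenceTransformer('all-MiniLM-L6-v2')
    text1 = "Gradient descent updates weights step by step. The learning rate controls the step size."
    text2 = "Momentum smooths noisy updates. Mini-batches make stochastic optimization practical."

    clusters = cluster_based_on_topics(embedder, text1, text2)
    for cluster_id, sentences in clusters.items():
        print(cluster_id, sentences)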