nevisende committed on
Commit
12c6dc9
·
1 Parent(s): 2511227

Feat: make response sentence-based

Browse files
Files changed (3) hide show
  1. .idea/.gitignore +8 -0
  2. .idea/word-analyzer.iml +12 -0
  3. app.py +52 -51
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Editor-based HTTP Client requests
5
+ /httpRequests/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
.idea/word-analyzer.iml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="jdk" jdkName="hf2" jdkType="Python SDK" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ <component name="PyDocumentationSettings">
9
+ <option name="format" value="PLAIN" />
10
+ <option name="myDocStringFormat" value="Plain" />
11
+ </component>
12
+ </module>
app.py CHANGED
@@ -13,6 +13,7 @@ from nltk.tokenize import word_tokenize
13
  from nltk.tag import pos_tag
14
  import gradio as gr
15
 
 
16
 
17
  load_dotenv()
18
  # Configuration
@@ -42,6 +43,7 @@ except IOError:
42
  spacy.cli.download(CONFIG['SPACY_MODEL'])
43
  nlp = spacy.load(CONFIG['SPACY_MODEL'])
44
 
 
45
  def get_wordnet_pos(treebank_tag):
46
  """Map POS tag to first character used by WordNet."""
47
  tag_map = {
@@ -49,6 +51,7 @@ def get_wordnet_pos(treebank_tag):
49
  }
50
  return tag_map.get(treebank_tag[0], None)
51
 
 
52
  def lesk_algorithm(word, sentence, pos=None):
53
  """Implement the Lesk algorithm for word sense disambiguation."""
54
  word = word.lower()
@@ -69,79 +72,76 @@ def lesk_algorithm(word, sentence, pos=None):
69
 
70
  return best_sense
71
 
72
- def create_unique_index(word, meaning, sentence):
 
73
  """Create a unique index for each word-meaning pair."""
74
- combined = f"{word}_{meaning}_{sentence}".encode('utf-8')
75
  return hashlib.md5(combined).hexdigest()
76
 
 
77
  def is_meaningful_word(token):
78
  """Check if a word is meaningful and should be included in the analysis."""
79
- return (token.has_vector and # This ensures the word is in spaCy's vocabulary
80
- not token.is_stop and # Exclude stop words
81
  token.pos_ not in ['PUNCT', 'SYM', 'X'] and # Exclude punctuation, symbols, and other
82
  len(token.text) > 1) # Exclude single-character tokens
83
 
 
 
 
84
  def process_sentence(sent):
85
- """Process a single sentence and return word information."""
86
- word_info = defaultdict(lambda: {"lemma": "", "meanings": []})
87
  doc = nlp(sent)
88
-
89
  for token in doc:
90
- if is_meaningful_word(token):
 
 
 
 
 
 
 
 
 
 
91
  word = token.text.lower()
92
  wordnet_pos = get_wordnet_pos(token.tag_)
93
-
94
- if not word_info[word]["lemma"]:
95
- word_info[word]["lemma"] = token.lemma_
96
-
97
  best_sense = lesk_algorithm(word, sent, wordnet_pos)
98
-
99
  if best_sense:
100
  definition = best_sense.definition()
101
  pos = best_sense.pos()
102
-
103
- unique_index = create_unique_index(word, definition, sent)
104
-
105
- new_meaning = {
 
 
106
  "index": unique_index,
107
  "meaning": definition,
108
- "POS": pos,
109
- "sentence": sent
110
- }
111
-
112
- if not any(m['meaning'] == definition for m in word_info[word]["meanings"]):
113
- word_info[word]["meanings"].append(new_meaning)
114
-
115
- return dict(word_info)
 
 
116
 
117
  def get_word_info(text):
118
- """Get word information for all sentences in the text."""
119
  sentences = nltk.sent_tokenize(text)
120
- word_info = defaultdict(lambda: {"lemma": "", "meanings": []})
121
-
122
- with ProcessPoolExecutor() as executor:
123
- future_to_sentence = {executor.submit(process_sentence, sent): sent for sent in sentences}
124
- for future in as_completed(future_to_sentence):
125
- sentence_info = future.result()
126
- for word, info in sentence_info.items():
127
- word_info[word]["lemma"] = info["lemma"]
128
- word_info[word]["meanings"].extend(info["meanings"])
129
-
130
- # If a word has no meanings, try to get a default definition
131
- for word, info in word_info.items():
132
- if not info["meanings"]:
133
- synsets = wn.synsets(word)
134
- if synsets:
135
- definition = synsets[0].definition()
136
- pos = synsets[0].pos()
137
- info["meanings"].append({
138
- "index": create_unique_index(word, definition, ""),
139
- "meaning": definition,
140
- "POS": pos,
141
- "sentence": "Default definition"
142
- })
143
-
144
- return dict(word_info)
145
 
146
  def process_text(selected_text, user_text):
147
  """Process the input text and return JSON results."""
@@ -153,6 +153,7 @@ def process_text(selected_text, user_text):
153
  logger.error(f"Error processing text: {str(e)}")
154
  return json.dumps({"error": "An error occurred while processing the text."})
155
 
 
156
  # Sample texts
157
  examples = [
158
  "The chef will season the steak with salt and pepper before grilling. Pumpkin spice lattes usually season the arrival of autumn.",
 
13
  from nltk.tag import pos_tag
14
  import gradio as gr
15
 
16
+ nltk.download('punkt_tab')
17
 
18
  load_dotenv()
19
  # Configuration
 
43
  spacy.cli.download(CONFIG['SPACY_MODEL'])
44
  nlp = spacy.load(CONFIG['SPACY_MODEL'])
45
 
46
+
47
  def get_wordnet_pos(treebank_tag):
48
  """Map POS tag to first character used by WordNet."""
49
  tag_map = {
 
51
  }
52
  return tag_map.get(treebank_tag[0], None)
53
 
54
+
55
  def lesk_algorithm(word, sentence, pos=None):
56
  """Implement the Lesk algorithm for word sense disambiguation."""
57
  word = word.lower()
 
72
 
73
  return best_sense
74
 
75
+
76
def create_unique_index(word, meaning):
    """Create a unique index for each word-meaning pair.

    The index is the MD5 hex digest of the string "word_meaning", so the
    same (word, meaning) pair always yields the same identifier.
    """
    key = "_".join((word, meaning))
    return hashlib.md5(key.encode('utf-8')).hexdigest()
80
 
81
+
82
def is_meaningful_word(token):
    """Check if a word is meaningful and should be included in the analysis.

    A token qualifies only when it is not a stop word, is not tagged as
    punctuation / symbol / other (PUNCT, SYM, X), and is longer than one
    character.
    """
    if token.is_stop:
        return False
    if token.pos_ in ('PUNCT', 'SYM', 'X'):
        return False
    return len(token.text) > 1
87
 
88
+
89
+
90
+
91
def process_sentence(sent):
    """Process a single sentence and return word information in order.

    Every token in the sentence produces one dict in the returned list:
    punctuation and whitespace tokens are tagged with a "type" marker,
    tokens whose sense the Lesk step resolves carry lemma/index/meaning/POS,
    and anything that cannot be disambiguated is tagged "unknown".
    """
    entries = []

    for token in nlp(sent):
        # Structural tokens are recorded as-is so sentence order survives.
        if token.is_punct:
            entries.append({"original": token.text, "type": "punctuation"})
            continue
        if token.is_space:
            entries.append({"original": token.text, "type": "space"})
            continue

        lowered = token.text.lower()
        sense = lesk_algorithm(lowered, sent, get_wordnet_pos(token.tag_))

        if sense:
            meaning = sense.definition()
            entries.append({
                "original": token.text,
                "lemma": token.lemma_,
                "index": create_unique_index(lowered, meaning),
                "meaning": meaning,
                "POS": sense.pos(),
            })
        else:
            # No WordNet sense found for this token.
            entries.append({"original": token.text, "type": "unknown"})

    return entries
133
+
134
 
135
def get_word_info(text):
    """Get word information for all sentences in the text, preserving sentence structure.

    Returns a list with one entry per sentence, where each entry is the
    ordered token-info list produced by process_sentence().
    """
    return [process_sentence(sentence) for sentence in nltk.sent_tokenize(text)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
  def process_text(selected_text, user_text):
147
  """Process the input text and return JSON results."""
 
153
  logger.error(f"Error processing text: {str(e)}")
154
  return json.dumps({"error": "An error occurred while processing the text."})
155
 
156
+
157
  # Sample texts
158
  examples = [
159
  "The chef will season the steak with salt and pepper before grilling. Pumpkin spice lattes usually season the arrival of autumn.",