cnmoro committed
Commit dc21de1
1 Parent(s): 117007f

Update app.py

Files changed (1): app.py (+8 -4)
app.py CHANGED
@@ -27,7 +27,7 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         return cosine_similarity([embed1], [embed2])[0][0]
 
     def create_lda_model(texts, stopwords):
-        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=stopwords)
+        vectorizer = CountVectorizer(stop_words=stopwords)
         doc_term_matrix = vectorizer.fit_transform(texts)
         lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
         lda.fit(doc_term_matrix)
@@ -49,12 +49,17 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         unique_words = set([word.lower() for word in words if word.lower() not in stopwords])
         lexical_diversity = len(unique_words) / len(words) if words else 0
 
-        # Combine factors (you can adjust weights as needed)
-        importance = (0.4 * semantic_similarity) + (0.4 * topic_importance) + (0.2 * lexical_diversity)
+        # Combine factors
+        importance = (0.6 * semantic_similarity) + (0.3 * topic_importance) + (0.2 * lexical_diversity)
         return importance
 
     # Split the text into sentences
     sentences = sent_tokenize(full_text)
+    final_sentences = []
+    for s in sentences:
+        broken_sentences = s.split('\n')
+        final_sentences.extend(broken_sentences)
+    sentences = final_sentences
 
     text_lang = detect_language(full_text)
 
@@ -91,7 +96,6 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
 
     return ' '.join(compressed_text)
 
-
 async def predict(text, word_reduction_factor):
     if len(text.split()) > 5000:
         return "Text is too long for this demo. Please provide a text with less than 5000 words."