cnmoro committed
Commit dc21de1
1 Parent(s): 117007f

Update app.py

Files changed (1): app.py (+8 -4)
app.py CHANGED
@@ -27,7 +27,7 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         return cosine_similarity([embed1], [embed2])[0][0]
 
     def create_lda_model(texts, stopwords):
-        vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=stopwords)
+        vectorizer = CountVectorizer(stop_words=stopwords)
         doc_term_matrix = vectorizer.fit_transform(texts)
         lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
         lda.fit(doc_term_matrix)
@@ -49,12 +49,17 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
         unique_words = set([word.lower() for word in words if word.lower() not in stopwords])
         lexical_diversity = len(unique_words) / len(words) if words else 0
 
-        # Combine factors (you can adjust weights as needed)
-        importance = (0.4 * semantic_similarity) + (0.4 * topic_importance) + (0.2 * lexical_diversity)
+        # Combine factors
+        importance = (0.6 * semantic_similarity) + (0.3 * topic_importance) + (0.2 * lexical_diversity)
         return importance
 
     # Split the text into sentences
     sentences = sent_tokenize(full_text)
+    final_sentences = []
+    for s in sentences:
+        broken_sentences = s.split('\n')
+        final_sentences.extend(broken_sentences)
+    sentences = final_sentences
 
     text_lang = detect_language(full_text)
 
@@ -91,7 +96,6 @@ def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
 
     return ' '.join(compressed_text)
 
-
 async def predict(text, word_reduction_factor):
     if len(text.split()) > 5000:
         return "Text is too long for this demo. Please provide a text with less than 5000 words."