KevSun committed on
Commit
5761cdf
1 Parent(s): c2795c9

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -0
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from sentence_transformers import SentenceTransformer, util
3
+ from sklearn.decomposition import LatentDirichletAllocation
4
+ from sklearn.feature_extraction.text import CountVectorizer
5
+ from sklearn.manifold import TSNE
6
+ from langdetect import detect, DetectorFactory
7
+ import numpy as np
8
+ import matplotlib.pyplot as plt
9
+ import pandas as pd
10
+
11
# langdetect is non-deterministic by default; pinning the seed makes language
# detection return the same answer for the same text on every run.
DetectorFactory.seed = 0

# Multilingual sentence-embedding model used for both word embeddings and
# sentence similarity.
# NOTE(review): Streamlit re-executes the script on each interaction, so this
# load runs on every rerun -- consider caching it if latency becomes an issue.
multi_embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
15
+
16
class WordEmbeddingAgent:
    """Thin wrapper that turns a sequence of words into embeddings."""

    def __init__(self, model):
        """Keep a reference to the encoder.

        Args:
            model: Any object exposing ``encode(items)``.
        """
        self.model = model

    def get_embeddings(self, words):
        """Return whatever ``self.model.encode`` produces for ``words``.

        Args:
            words: Sequence of strings to embed.
        """
        return self.model.encode(words)
22
+
23
class SimilarityAgent:
    """Score how similar two texts are using cosine similarity of embeddings."""

    def __init__(self, model):
        """Keep a reference to the encoder.

        Args:
            model: Object exposing ``encode(text, convert_to_tensor=True)``.
        """
        self.model = model

    def compute_similarity(self, text1, text2):
        """Return the cosine similarity between ``text1`` and ``text2``.

        Each argument must be a single string: the result of the pairwise
        comparison is unwrapped with ``.item()``, which only works when the
        similarity matrix holds exactly one value.

        Returns:
            The similarity as a plain Python float.
        """
        embedding1 = self.model.encode(text1, convert_to_tensor=True)
        embedding2 = self.model.encode(text2, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(embedding1, embedding2)
        return similarity.item()
31
+
32
class TopicModelingAgent:
    """Fit an LDA topic model on a corpus and report the top words per topic."""

    def __init__(self, n_components=10):
        """Create the underlying LDA model.

        Args:
            n_components: Number of topics to learn.
        """
        self.lda_model = LatentDirichletAllocation(n_components=n_components, random_state=42)

    def fit_transform(self, texts, lang):
        """Vectorize ``texts``, fit the LDA model and return topic weights.

        Args:
            texts: Iterable of documents (strings).
            lang: Language code of the documents; ``'en'`` enables English
                stop-word removal.

        Returns:
            A ``(document_topic_matrix, vectorizer)`` tuple.

        Raises:
            ValueError: Propagated from scikit-learn, e.g. when no vocabulary
                remains after stop-word removal and pruning.
        """
        texts = list(texts)

        # scikit-learn only ships an English stop-word list; any other string
        # (such as 'spanish') raises ValueError, so fall back to no filtering.
        stop_words = 'english' if lang == 'en' else None

        # The 0.9 / 2 document-frequency bounds cannot both be satisfied with
        # fewer than three documents (0.9 * n < 2), which makes the vectorizer
        # raise. Disable pruning for such tiny corpora.
        if len(texts) >= 3:
            max_df, min_df = 0.9, 2
        else:
            max_df, min_df = 1.0, 1

        vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, stop_words=stop_words)
        dtm = vectorizer.fit_transform(texts)
        self.lda_model.fit(dtm)
        return self.lda_model.transform(dtm), vectorizer

    def get_topics(self, vectorizer, num_words=10):
        """Return the highest-weighted words of every fitted topic.

        Args:
            vectorizer: The fitted vectorizer returned by ``fit_transform``.
            num_words: Maximum number of words per topic.

        Returns:
            Dict mapping topic index to a list of words, ordered from the
            lowest to the highest weight among the selected top words.
        """
        if num_words <= 0:
            # A slice of [-0:] would select the whole vocabulary.
            return {idx: [] for idx, _ in enumerate(self.lda_model.components_)}

        # Fetch the vocabulary once instead of once per selected word.
        feature_names = vectorizer.get_feature_names_out()
        return {
            idx: [feature_names[i] for i in topic.argsort()[-num_words:]]
            for idx, topic in enumerate(self.lda_model.components_)
        }
48
+
49
def detect_language(text):
    """Return the detected language code of ``text``, or ``"unknown"``.

    Args:
        text: The text to classify.

    Returns:
        The language code reported by ``langdetect.detect``, or the string
        ``"unknown"`` when the input is blank, not a string, or detection
        fails.
    """
    # Nothing to detect; answer directly instead of relying on an exception.
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    # Imported locally so the handler can name the library's own error type
    # rather than using a bare ``except`` that also traps KeyboardInterrupt.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text)
    except LangDetectException:
        return "unknown"
54
+
55
def tsne_visualization(embeddings, words):
    """Project ``embeddings`` to 2-D with t-SNE and label each point.

    Args:
        embeddings: Array-like with one row per word.
        words: Labels, one per row of ``embeddings``.

    Returns:
        A DataFrame with columns ``x``, ``y`` and ``word``. With fewer than
        two samples there is nothing to project, so the points are placed at
        the origin.
    """
    points = np.asarray(embeddings)
    n_samples = len(points)

    if n_samples < 2:
        coords = np.zeros((n_samples, 2))
    else:
        # scikit-learn requires perplexity < n_samples; its default of 30
        # makes short inputs fail, so clamp it to the sample count.
        perplexity = min(30.0, n_samples - 1)
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
        coords = tsne.fit_transform(points)

    df = pd.DataFrame(coords, columns=['x', 'y'])
    df['word'] = words
    return df
61
+
62
def _render_word_embeddings(embedding_agent, text):
    """Embed each whitespace-separated token of ``text`` and plot it via t-SNE."""
    words = text.split()

    embeddings = embedding_agent.get_embeddings(words)
    st.write("Word Embeddings Generated.")

    try:
        tsne_df = tsne_visualization(embeddings, words)
    except ValueError as err:
        # t-SNE rejects inputs that are too small for its perplexity setting;
        # report it instead of aborting the rest of the analysis.
        st.warning(f"t-SNE visualization skipped: {err}")
        return

    fig, ax = plt.subplots()
    ax.scatter(tsne_df['x'], tsne_df['y'])
    for word, x, y in zip(tsne_df['word'], tsne_df['x'], tsne_df['y']):
        ax.annotate(word, (x, y))
    st.pyplot(fig)
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)


def _render_topics(topic_modeling_agent, text, lang):
    """Fit the topic model on ``text`` plus a filler document and list topics."""
    texts = [text, "Another text to improve topic modeling."]
    try:
        _, vectorizer = topic_modeling_agent.fit_transform(texts, lang)
    except ValueError as err:
        # The vectorizer raises when the corpus is too small or no vocabulary
        # survives pruning; keep the app running and tell the user.
        st.warning(f"Topic modeling skipped: {err}")
        return

    topics = topic_modeling_agent.get_topics(vectorizer)
    st.write("Topics Extracted:")
    for topic_id, top_words in topics.items():
        st.write(f"Topic {topic_id}: {', '.join(top_words)}")


def _render_similarity(similarity_agent, text, lang):
    """Show the similarity between ``text`` and a fixed example sentence."""
    if lang != 'en':
        text2 = "Otro texto de ejemplo para comparación de similitud."
    else:
        text2 = "Another example text for similarity comparison."
    similarity_score = similarity_agent.compute_similarity(text, text2)
    st.write(f"Similarity Score with example text: {similarity_score:.4f}")


def main():
    """Build the Streamlit page and run the analysis when requested.

    After the user presses "Analyze", the text's language is detected, its
    words are embedded and plotted, topics are extracted, and a similarity
    score against an example sentence is shown.
    """
    st.title("Multilingual Text Analysis System")
    user_input = st.text_area("Enter your text here:")

    if not st.button("Analyze"):
        return

    # Whitespace-only input is truthy but contains nothing to analyze.
    if not user_input or not user_input.strip():
        st.warning("Please enter some text to analyze.")
        return

    lang = detect_language(user_input)
    st.write(f"Detected language: {lang}")

    embedding_agent = WordEmbeddingAgent(multi_embedding_model)
    similarity_agent = SimilarityAgent(multi_embedding_model)
    topic_modeling_agent = TopicModelingAgent()

    _render_word_embeddings(embedding_agent, user_input)
    _render_topics(topic_modeling_agent, user_input, lang)
    _render_similarity(similarity_agent, user_input, lang)
107
+
108
# Entry point: build the UI when the file is executed as a script.
if __name__ == "__main__":
    main()
110
+