import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
from collections import Counter
import networkx as nx
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import wordcloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.colors as mcolors
import io
import base64
from utils.model_loader import download_nltk_resources
from utils.helpers import fig_to_html, df_to_html_table


def classify_topic(text_input):
    """Classify the topic of the text into predefined categories."""
    # Define topic keywords
    topic_keywords = {
        'environment': ['climate', 'environment', 'weather', 'earth', 'temperature', 'pollution', 'warming', 'planet', 'ecosystem', 'sustainable'],
        'science': ['science', 'scientific', 'research', 'study', 'experiment', 'discovery', 'theory', 'laboratory', 'data'],
        'business': ['business', 'company', 'market', 'economy', 'economic', 'finance', 'industry', 'corporate', 'trade'],
        'education': ['education', 'school', 'student', 'learn', 'teach', 'academic', 'university', 'college', 'knowledge'],
        'health': ['health', 'medical', 'doctor', 'patient', 'disease', 'treatment', 'hospital', 'medicine', 'healthcare'],
        'technology': ['technology', 'tech', 'computer', 'digital', 'software', 'hardware', 'internet', 'device', 'innovation'],
        'politics': ['politics', 'government', 'policy', 'election', 'political', 'law', 'president', 'party', 'vote'],
        'sports': ['sport', 'game', 'team', 'player', 'competition', 'athlete', 'championship', 'tournament', 'coach'],
        'entertainment': ['entertainment', 'movie', 'music', 'film', 'television', 'celebrity', 'actor', 'actress', 'show'],
        'travel': ['travel', 'trip', 'vacation', 'tourist', 'destination', 'journey', 'adventure', 'flight', 'hotel']
    }

    # Convert text to lowercase
    text = text_input.lower()

    # Count keyword occurrences for each topic
    topic_scores = {}
    for topic, keywords in topic_keywords.items():
        score = 0
        for keyword in keywords:
            # Count occurrences of the keyword
            count = text.count(keyword)
            # Add to the topic score
            score += count
        # Store the normalized score
        topic_scores[topic] = score / (len(text.split()) + 0.001)  # Normalize by text length

    # Get the main topic and confidence
    main_topic = max(topic_scores.items(), key=lambda x: x[1])
    total_score = sum(topic_scores.values()) + 0.001  # Avoid division by zero
    confidence = main_topic[1] / total_score if total_score > 0 else 0
    confidence = round(confidence * 100, 1)  # Convert to percentage

    # Sort topics by score for visualization
    sorted_topics = sorted(topic_scores.items(), key=lambda x: x[1], reverse=True)

    return main_topic[0], confidence, sorted_topics, topic_scores
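
# Illustrative usage (hypothetical input; the exact scores depend on the text):
#   main_topic, confidence, sorted_topics, topic_scores = classify_topic(
#       "Rising temperatures and pollution are damaging the planet's ecosystem.")
#   # main_topic -> 'environment'; confidence is the topic's share of all keyword scores (as a %);
#   # sorted_topics is a list of (topic, length-normalized score) pairs in descending order.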


def extract_key_phrases(text_input, top_n=10):
    """Extract key phrases from text."""
    # Download required NLTK resources
    download_nltk_resources()

    # Define stop words
    stop_words = set(stopwords.words('english'))

    # Extract 2-3 word phrases (n-grams)
    phrases = []

    # Get bigrams
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english', max_features=100)
    try:
        bigram_matrix = bigram_vectorizer.fit_transform([text_input])
        bigram_features = bigram_vectorizer.get_feature_names_out()
        bigram_scores = bigram_matrix.toarray()[0]
        for phrase, score in zip(bigram_features, bigram_scores):
            if score >= 1:  # Must appear at least once
                phrases.append((phrase, int(score)))
    except Exception:
        pass  # Handle potential errors (e.g., empty vocabulary after stop-word removal)

    # Get trigrams
    trigram_vectorizer = CountVectorizer(ngram_range=(3, 3), stop_words='english', max_features=100)
    try:
        trigram_matrix = trigram_vectorizer.fit_transform([text_input])
        trigram_features = trigram_vectorizer.get_feature_names_out()
        trigram_scores = trigram_matrix.toarray()[0]
        for phrase, score in zip(trigram_features, trigram_scores):
            if score >= 1:  # Must appear at least once
                phrases.append((phrase, int(score)))
    except Exception:
        pass

    # Also extract single important words (nouns, verbs, adjectives)
    words = word_tokenize(text_input)
    pos_tags = nltk.pos_tag(words)
    important_words = []
    for word, tag in pos_tags:
        # Only consider nouns, verbs, and adjectives that are not stop words
        if (tag.startswith('NN') or tag.startswith('VB') or tag.startswith('JJ')) and word.lower() not in stop_words and len(word) > 2:
            important_words.append(word.lower())

    # Count word frequencies
    word_freq = Counter(important_words)

    # Add important single words to phrases
    for word, freq in word_freq.most_common(top_n):
        if freq >= 1:
            phrases.append((word, freq))

    # Sort phrases by frequency and return the top N
    sorted_phrases = sorted(phrases, key=lambda x: x[1], reverse=True)
    return sorted_phrases[:top_n]
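
# Illustrative usage (hypothetical input; actual phrases depend on the text):
#   extract_key_phrases("Climate change research shows that climate change is accelerating.", top_n=5)
#   might return something like [('climate change', 2), ('climate', 2), ('change', 2), ('research', 1), ('shows', 1)],
#   mixing bigrams, trigrams, and important single words ranked by frequency.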


def create_phrase_cloud(phrases):
    """Create a word cloud from phrases."""
    # Convert phrases to a dictionary of {phrase: frequency}
    phrase_freq = {phrase: freq for phrase, freq in phrases}

    # Create word cloud
    wc = wordcloud.WordCloud(
        background_color='white',
        width=600,
        height=400,
        colormap='viridis',
        max_words=50,
        prefer_horizontal=0.9,
        random_state=42
    )
    try:
        # Generate word cloud from phrases
        wc.generate_from_frequencies(phrase_freq)

        # Create figure
        fig = plt.figure(figsize=(10, 6))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.tight_layout()
        return fig_to_html(fig)
    except Exception:
        return "<p>Could not generate phrase cloud due to insufficient data.</p>"


def topic_analysis_handler(text_input):
    """Show topic analysis capabilities."""
    output_html = []

    # Add result area container
    output_html.append('<div class="result-area">')
    output_html.append('<h2 class="task-header">Topic Analysis</h2>')
    output_html.append("""
    <div class="alert alert-info">
        <i class="fas fa-info-circle"></i>
        Topic analysis identifies the main themes and subjects in a text, helping to categorize content and understand what it's about.
    </div>
    """)

    # Model info
    output_html.append("""
    <div class="alert alert-info">
        <h4><i class="fas fa-tools"></i> Models & Techniques Used:</h4>
        <ul>
            <li><b>Keyword-based Classification</b> - Scores the text against predefined topic vocabularies</li>
            <li><b>TF-IDF Vectorizer</b> - Statistical method to identify important terms</li>
            <li><b>LDA Topic Modeling</b> - Probabilistic model that discovers latent topics</li>
            <li><b>Word/Phrase Analysis</b> - Extraction of important n-grams</li>
        </ul>
    </div>
    """)

    try:
        # Ensure NLTK resources are downloaded
        download_nltk_resources()

        # Check if text is long enough for meaningful analysis
        if len(text_input.split()) < 50:
            output_html.append(f"""
            <div class="alert alert-warning">
                <h3>Text Too Short for Full Topic Analysis</h3>
                <p>The provided text contains only {len(text_input.split())} words.
                For meaningful topic analysis, please provide a longer text (at least 50 words).
                We'll still perform basic frequency analysis, but topic modeling results may not be reliable.</p>
            </div>
            """)

        # Text cleaning and preprocessing
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()

        def preprocess_text(text):
            # Tokenize
            tokens = word_tokenize(text.lower())
            # Remove stopwords and non-alphabetic tokens
            filtered_tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
            # Lemmatize
            lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
            return lemmatized_tokens

        # Process the text
        processed_tokens = preprocess_text(text_input)
        processed_text = ' '.join(processed_tokens)

        # Add Topic Classification section
        output_html.append('<h3 class="task-subheader">Topic Classification</h3>')

        # Get topic classification
        main_topic, confidence, sorted_topics, topic_scores = classify_topic(text_input)

        # Display topic classification results
        output_html.append(f"""
        <div class="alert alert-success">
            <p class="mb-0 fs-5">This text is primarily about <strong>{main_topic}</strong> with {confidence}% confidence</p>
        </div>
        """)

        # Display topic scores (stacked rows to avoid overlap)
        output_html.append('<div class="row">')

        # Row 1: Topic Relevance chart (full width)
        output_html.append('<div class="col-12">')
        output_html.append('<h4>Topic Relevance</h4>')

        # Create horizontal bar chart for topic scores
        plt.figure(figsize=(10, 6))
        topics = [topic for topic, score in sorted_topics]
        scores = [score for topic, score in sorted_topics]

        # Only show top topics for clarity
        top_n = min(10, len(topics))
        y_pos = np.arange(top_n)

        # Get a color gradient
        colors = plt.cm.Blues(np.linspace(0.4, 0.8, top_n))

        # Create horizontal bars
        bars = plt.barh(y_pos, [s * 100 for s in scores[:top_n]], color=colors)

        # Add labels and values
        for i, bar in enumerate(bars):
            width = bar.get_width()
            plt.text(width + 0.5, bar.get_y() + bar.get_height() / 2,
                     f"{width:.1f}%",
                     va='center')

        plt.yticks(y_pos, topics[:top_n])
        plt.xlabel('Relevance (%)')
        plt.title('Topic Scores')
        plt.tight_layout()
        output_html.append(fig_to_html(plt.gcf()))
        output_html.append('</div>')
        output_html.append('</div>')  # Close row 1

        # Row 2: Topic Scores table (full width)
        output_html.append('<div class="row mt-3">')
        output_html.append('<div class="col-12">')
        output_html.append('<h4>Topic Scores</h4>')

        # Create table of topic scores
        topic_scores_df = pd.DataFrame({
            'Rank': range(1, len(sorted_topics) + 1),
            'Topic': [topic.capitalize() for topic, _ in sorted_topics],
            'Confidence': [f"{score:.4f}" for _, score in sorted_topics]
        })
        output_html.append(df_to_html_table(topic_scores_df))
        output_html.append('</div>')
        output_html.append('</div>')  # Close row 2

        # Extract and display key phrases
        output_html.append('<h3 class="task-subheader">Key Phrases</h3>')
        key_phrases = extract_key_phrases(text_input)

        # Display key phrases in a table
        if key_phrases:
            phrase_df = pd.DataFrame({
                'Phrase': [phrase for phrase, _ in key_phrases],
                'Frequency': [freq for _, freq in key_phrases]
            })
            output_html.append('<div class="row">')
            # Row 1: Key phrases table (full width)
            output_html.append('<div class="col-12">')
            output_html.append(df_to_html_table(phrase_df))
            output_html.append('</div>')
            output_html.append('</div>')  # Close row 1
            # Row 2: Phrase cloud (full width)
            output_html.append('<div class="row mt-3">')
            output_html.append('<div class="col-12">')
            output_html.append(create_phrase_cloud(key_phrases))
            output_html.append('</div>')
            output_html.append('</div>')  # Close row 2
        else:
            output_html.append("<p>No key phrases could be extracted from the text.</p>")

        # Term Frequency Analysis
        output_html.append('<h3 class="task-subheader">Key Term Frequency Analysis</h3>')

        # Get token frequencies, sorted by frequency
        token_freq = Counter(processed_tokens)
        sorted_word_freq = dict(sorted(token_freq.items(), key=lambda item: item[1], reverse=True))

        # Take top 25 words for visualization
        top_n = 25
        top_words = list(sorted_word_freq.keys())[:top_n]
        top_freqs = list(sorted_word_freq.values())[:top_n]

        # Create visualization
        fig = plt.figure(figsize=(10, 6))
        colors = plt.cm.viridis(np.linspace(0.3, 0.85, len(top_words)))
        bars = plt.bar(top_words, top_freqs, color=colors)
        plt.xlabel('Term')
        plt.ylabel('Frequency')
        plt.title(f'Top {top_n} Term Frequencies')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()

        # Add value labels on top of bars
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width() / 2., height + 0.1,
                     f'{height}',
                     ha='center', va='bottom',
                     fontsize=8)

        # Show plot and table in stacked rows
        output_html.append('<div class="row">')
        # Row 1: Chart (full width)
        output_html.append('<div class="col-12">')
        output_html.append(fig_to_html(fig))
        output_html.append('</div>')
        output_html.append('</div>')  # Close row 1
        # Row 2: Top terms table (full width)
        output_html.append('<div class="row mt-3">')
        output_html.append('<div class="col-12">')
        output_html.append('<h4>Top Terms</h4>')

        # Create DataFrame of top terms
        top_terms_df = pd.DataFrame({
            'Term': list(sorted_word_freq.keys())[:15],
            'Frequency': list(sorted_word_freq.values())[:15]
        })
        output_html.append(df_to_html_table(top_terms_df))
        output_html.append('</div>')
        output_html.append('</div>')  # Close row 2

        # WordCloud visualization
        output_html.append('<h3 class="task-subheader">Word Cloud Visualization</h3>')
        output_html.append('<p>The size of each word represents its frequency in the text.</p>')

        # Generate word cloud
        wc = wordcloud.WordCloud(
            background_color='white',
            max_words=100,
            width=800,
            height=400,
            colormap='viridis',
            contour_width=1,
            contour_color='steelblue'
        )
        wc.generate_from_frequencies(sorted_word_freq)

        # Create figure
        fig = plt.figure(figsize=(12, 6))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.tight_layout()
        output_html.append(fig_to_html(fig))

        # TF-IDF Analysis
        output_html.append('<h3 class="task-subheader">TF-IDF Analysis</h3>')
        output_html.append("""
        <div class="alert alert-light">
            <p class="mb-0">
                Term Frequency-Inverse Document Frequency (TF-IDF) identifies terms that are distinctive to parts of the text.
                In this case, we treat each sentence as a separate "document" for the analysis.
            </p>
        </div>
        """)

        # Split text into sentences
        sentences = nltk.sent_tokenize(text_input)

        # Only perform TF-IDF if there are enough sentences
        if len(sentences) >= 3:
            # Create TF-IDF vectorizer
            tfidf_vectorizer = TfidfVectorizer(
                max_features=100,
                stop_words='english',
                min_df=1
            )

            # Fit and transform the sentences
            tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

            # Get feature names
            feature_names = tfidf_vectorizer.get_feature_names_out()

            # Create a table of top TF-IDF terms for each sentence
            tfidf_data = []
            for i, sentence in enumerate(sentences[:min(len(sentences), 5)]):  # Show at most 5 sentences to avoid clutter
                # Get top terms for this sentence
                tfidf_scores = tfidf_matrix[i].toarray()[0]
                top_indices = np.argsort(tfidf_scores)[-5:][::-1]  # Top 5 terms
                top_terms = [feature_names[idx] for idx in top_indices]
                top_scores = [tfidf_scores[idx] for idx in top_indices]

                # Format for display
                formatted_terms = ', '.join([f"{term} ({score:.3f})" for term, score in zip(top_terms, top_scores)])
                shortened_sentence = (sentence[:75] + '...') if len(sentence) > 75 else sentence
                tfidf_data.append({
                    'Sentence': shortened_sentence,
                    'Distinctive Terms (TF-IDF scores)': formatted_terms
                })

            # Create dataframe
            tfidf_df = pd.DataFrame(tfidf_data)
            output_html.append('<div class="mt-3">')
            output_html.append(df_to_html_table(tfidf_df))
            output_html.append('</div>')

            # Create a TF-IDF term-sentence heatmap
            if len(sentences) <= 10:  # Only create a heatmap for a reasonable number of sentences
                # Get top terms across all sentences
                mean_tfidf = np.mean(tfidf_matrix.toarray(), axis=0)
                top_indices = np.argsort(mean_tfidf)[-10:][::-1]  # Top 10 terms
                top_terms = [feature_names[idx] for idx in top_indices]

                # Create heatmap data
                heatmap_data = tfidf_matrix[:, top_indices].toarray()

                # Create heatmap
                fig, ax = plt.subplots(figsize=(10, 6))
                plt.imshow(heatmap_data, cmap='viridis', aspect='auto')

                # Add labels
                plt.yticks(range(len(sentences)), [f"Sent {i+1}" for i in range(len(sentences))])
                plt.xticks(range(len(top_terms)), top_terms, rotation=45, ha='right')
                plt.colorbar(label='TF-IDF Score')
                plt.xlabel('Terms')
                plt.ylabel('Sentences')
                plt.title('TF-IDF Heatmap: Term Importance by Sentence')
                plt.tight_layout()

                output_html.append('<h4>Term Importance Heatmap</h4>')
                output_html.append('<p>This heatmap shows which terms are most distinctive in each sentence.</p>')
                output_html.append(fig_to_html(fig))
        else:
            output_html.append("""
            <div class="alert alert-warning">
                <p class="mb-0">TF-IDF analysis requires at least 3 sentences. The provided text doesn't have enough sentences for this analysis.</p>
            </div>
            """)

        # Topic Modeling
        output_html.append('<h3 class="task-subheader">Topic Modeling</h3>')
        output_html.append("""
        <div class="alert alert-light">
            <p class="mb-0">
                Topic modeling uses statistical methods to discover abstract "topics" that occur in a collection of documents.
                Here, we use Latent Dirichlet Allocation (LDA) to identify potential topics.
            </p>
        </div>
        """)

        # Check if text is long enough for topic modeling
        if len(text_input.split()) < 50:
            output_html.append("""
            <div class="alert alert-warning">
                <p class="mb-0">Topic modeling works best with longer texts. The provided text is too short for reliable topic modeling.</p>
            </div>
            """)
        else:
            # For a single text, split by sentences to create a small "corpus"
            sentences = nltk.sent_tokenize(text_input)
            if len(sentences) < 4:
                output_html.append("""
                <div class="alert alert-warning">
                    <p class="mb-0">Topic modeling works best with multiple documents or paragraphs. Since the provided text has few sentences,
                    the topic modeling results may not be meaningful.</p>
                </div>
                """)

            # Create document-term matrix using CountVectorizer
            vectorizer = CountVectorizer(
                max_features=1000,
                stop_words='english',
                min_df=1
            )
            dtm = vectorizer.fit_transform(sentences)
            feature_names = vectorizer.get_feature_names_out()

            # Set number of topics based on text length
            n_topics = min(3, max(2, len(sentences) // 3))

            # LDA topic modeling
            lda_model = LatentDirichletAllocation(
                n_components=n_topics,
                max_iter=10,
                learning_method='online',
                random_state=42
            )
            lda_model.fit(dtm)

            # Get top terms for each topic
            n_top_words = 10
            topic_terms = []
            for topic_idx, topic in enumerate(lda_model.components_):
                top_indices = topic.argsort()[:-n_top_words - 1:-1]
                top_terms = [feature_names[i] for i in top_indices]
                topic_weight = topic[top_indices].sum() / topic.sum()  # Approximation of topic "importance"
                topic_terms.append({
                    "Topic": f"Topic {topic_idx + 1}",
                    "Top Terms": ", ".join(top_terms),
                    "Weight": f"{topic_weight:.2f}"
                })

            topic_df = pd.DataFrame(topic_terms)
            output_html.append('<h4>LDA Topic Model Results</h4>')
            output_html.append(df_to_html_table(topic_df))

            # Create a word cloud for each topic
            output_html.append('<h4>Topic Word Clouds</h4>')
            output_html.append('<div class="row">')
            for topic_idx, topic in enumerate(lda_model.components_):
                # Get topic words and weights
                word_weights = {feature_names[i]: topic[i] for i in topic.argsort()[:-50 - 1:-1]}

                # Generate word cloud
                wc = wordcloud.WordCloud(
                    background_color='white',
                    max_words=30,
                    width=400,
                    height=300,
                    colormap='plasma',
                    contour_width=1,
                    contour_color='steelblue'
                )
                wc.generate_from_frequencies(word_weights)

                # Create figure
                fig = plt.figure(figsize=(6, 4))
                plt.imshow(wc, interpolation='bilinear')
                plt.axis('off')
                plt.title(f'Topic {topic_idx + 1}')
                plt.tight_layout()

                output_html.append('<div class="col-12 mb-3">')
                output_html.append(fig_to_html(fig))
                output_html.append('</div>')
            output_html.append('</div>')  # Close row for word clouds

            # Topic distribution visualization
            topic_distribution = lda_model.transform(dtm)

            # Calculate the dominant topic for each sentence
            dominant_topics = np.argmax(topic_distribution, axis=1)

            # Count the number of sentences for each dominant topic
            topic_counts = Counter(dominant_topics)

            # Prepare data for visualization
            topics = [f"Topic {i+1}" for i in range(n_topics)]
            counts = [topic_counts.get(i, 0) for i in range(n_topics)]

            # Create visualization
            fig = plt.figure(figsize=(8, 5))
            bars = plt.bar(topics, counts, color=plt.cm.plasma(np.linspace(0.15, 0.85, n_topics)))

            # Add value labels
            for bar in bars:
                height = bar.get_height()
                plt.text(bar.get_x() + bar.get_width() / 2., height + 0.1,
                         f'{height}',
                         ha='center', va='bottom')

            plt.xlabel('Topic')
            plt.ylabel('Number of Sentences')
            plt.title('Distribution of Dominant Topics Across Sentences')
            plt.tight_layout()

            output_html.append('<h4>Topic Distribution</h4>')
            output_html.append(fig_to_html(fig))

            # Topic network graph
            output_html.append('<h4>Topic-Term Network</h4>')
            output_html.append('<p>This visualization shows the relationships between topics and their most important terms.</p>')

            # Create network graph
            G = nx.Graph()

            # Add topic nodes
            for i in range(n_topics):
                G.add_node(f"Topic {i+1}", type='topic', size=1000)

            # Add term nodes and edges
            for topic_idx, topic in enumerate(lda_model.components_):
                topic_name = f"Topic {topic_idx+1}"
                # Get top terms for this topic
                top_indices = topic.argsort()[:-11:-1]
                for i in top_indices:
                    term = feature_names[i]
                    weight = topic[i]
                    # Only add terms with significant weight
                    if weight > 0.01:
                        if not G.has_node(term):
                            G.add_node(term, type='term', size=300)
                        G.add_edge(topic_name, term, weight=weight)

            # Create graph visualization
            fig = plt.figure(figsize=(10, 8))

            # Position nodes using spring layout
            pos = nx.spring_layout(G, k=0.3, seed=42)

            # Draw nodes
            topic_nodes = [node for node in G.nodes() if G.nodes[node]['type'] == 'topic']
            term_nodes = [node for node in G.nodes() if G.nodes[node]['type'] == 'term']

            # Draw topic nodes
            nx.draw_networkx_nodes(
                G, pos,
                nodelist=topic_nodes,
                node_color='#E53935',
                node_size=[G.nodes[node]['size'] for node in topic_nodes],
                alpha=0.8
            )

            # Draw term nodes
            nx.draw_networkx_nodes(
                G, pos,
                nodelist=term_nodes,
                node_color='#1976D2',
                node_size=[G.nodes[node]['size'] for node in term_nodes],
                alpha=0.6
            )

            # Draw edges with varying thickness
            edge_weights = [G[u][v]['weight'] * 5 for u, v in G.edges()]
            nx.draw_networkx_edges(
                G, pos,
                width=edge_weights,
                alpha=0.5,
                edge_color='gray'
            )

            # Draw labels
            nx.draw_networkx_labels(
                G, pos,
                font_size=10,
                font_weight='bold'
            )

            plt.axis('off')
            plt.tight_layout()
            output_html.append(fig_to_html(fig))

            # Add note about interpreting results
            output_html.append("""
            <div class="alert alert-info">
                <h4>Interpreting Topic Models</h4>
                <p>Topic modeling is an unsupervised technique that works best with large collections of documents.
                For a single text, especially shorter ones, topics may be less distinct or meaningful.
                The "topics" shown here represent clusters of words that frequently appear together in the text.</p>
                <p>For better topic modeling results:</p>
                <ul>
                    <li>Use longer texts with at least several paragraphs</li>
                    <li>Provide multiple related documents for analysis</li>
                    <li>Consider domain-specific preprocessing</li>
                </ul>
            </div>
            """)
    except Exception as e:
        output_html.append(f"""
        <div class="alert alert-danger">
            <h3>Error</h3>
            <p>Failed to analyze topics: {str(e)}</p>
        </div>
        """)

    # About Topic Analysis section
    output_html.append("""
    <div class="card mt-4">
        <div class="card-header">
            <h4 class="mb-0">
                <i class="fas fa-info-circle"></i>
                About Topic Analysis
            </h4>
        </div>
        <div class="card-body">
            <h5>What is Topic Analysis?</h5>
            <p>Topic analysis, also known as topic modeling or topic extraction, is the process of identifying the main themes
            or topics that occur in a collection of documents. It uses statistical models to discover abstract topics based
            on word distributions throughout the texts.</p>
            <h5>Common Approaches:</h5>
            <ul>
                <li><b>Term Frequency Analysis</b> - Simple counting of terms to find the most common topics</li>
                <li><b>TF-IDF (Term Frequency-Inverse Document Frequency)</b> - Identifies terms that are distinctive to particular documents or sections</li>
                <li><b>LDA (Latent Dirichlet Allocation)</b> - A probabilistic model that assigns topic distributions to documents</li>
                <li><b>NMF (Non-negative Matrix Factorization)</b> - A linear-algebraic approach to topic discovery</li>
                <li><b>BERTopic</b> - A modern approach that uses BERT embeddings and clustering for topic modeling</li>
            </ul>
            <h5>Applications:</h5>
            <ul>
                <li><b>Content organization</b> - Categorizing documents by topic</li>
                <li><b>Trend analysis</b> - Tracking how topics evolve over time</li>
                <li><b>Content recommendation</b> - Suggesting related content based on topic similarity</li>
                <li><b>Customer feedback analysis</b> - Understanding main themes in reviews or feedback</li>
                <li><b>Research insights</b> - Identifying research themes in academic papers</li>
            </ul>
        </div>
    </div>
    """)

    output_html.append('</div>')  # Close result-area div
    return '\n'.join(output_html)
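

# Minimal local demo sketch: assumes the utils package (download_nltk_resources, fig_to_html,
# df_to_html_table) and the NLTK / scikit-learn / wordcloud dependencies are available.
# The sample text and output filename are made up for illustration.
if __name__ == "__main__":
    sample_text = (
        "Climate change is reshaping the planet. Rising temperatures alter weather patterns, "
        "and pollution accelerates the warming of the earth. Scientists study the ecosystem "
        "to design sustainable policies. Researchers also track data on temperature and weather "
        "to understand how the climate of the planet is changing over time. Governments debate "
        "environmental policy while businesses invest in renewable technology to reduce their "
        "impact on the environment."
    )
    html_report = topic_analysis_handler(sample_text)
    # Write the generated report to disk so it can be opened in a browser.
    with open("topic_analysis_demo.html", "w", encoding="utf-8") as f:
        f.write(html_report)
    print(f"Report written ({len(html_report)} characters).")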