"""Streamlit app that charts the most frequent words of a Markdown document."""

import re
from collections import Counter

import pandas as pd
import plotly.express as px
import requests
import streamlit as st
# NOTE(review): `pipeline` is not referenced by any code visible in this file;
# the import is kept in case something outside this view relies on it.
from transformers import pipeline

# Matches GitHub "blob" page URLs, which serve an HTML viewer rather than the
# file contents.
_GITHUB_BLOB_RE = re.compile(
    r'^https?://(?:www\.)?github\.com/'
    r'(?P<owner>[^/]+)/(?P<repo>[^/]+)/blob/(?P<rest>.+)$'
)
_NON_ALPHANUMERIC_RE = re.compile(r'[^A-Za-z0-9]+')
_LONG_WORD_RE = re.compile(r'\b\w{5,}\b')
_SENTENCE_END_RE = re.compile(r'[.?!]')


def _to_raw_github_url(url):
    """Rewrite a GitHub ``/blob/`` URL to its raw-content URL.

    URLs that do not look like a GitHub blob page are returned unchanged.
    """
    match = _GITHUB_BLOB_RE.match(url.strip())
    if match is None:
        return url
    return (
        'https://raw.githubusercontent.com/'
        f"{match['owner']}/{match['repo']}/{match['rest']}"
    )


def get_markdown_from_github(url, timeout=10):
    """Download the text found at *url*.

    GitHub ``/blob/`` page URLs are rewritten to ``raw.githubusercontent.com``
    first, so the Markdown source is fetched instead of the HTML page that
    wraps it.

    Args:
        url: Address of the document to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded as text.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
    """
    response = requests.get(_to_raw_github_url(url), timeout=timeout)
    # Fail loudly instead of analysing the body of an error page.
    response.raise_for_status()
    return response.text


def preprocess_text(text):
    """Lower-case *text* and replace each non-alphanumeric run with a space.

    Only ASCII letters and digits survive; everything else (punctuation,
    whitespace, non-ASCII characters) becomes a single space.
    """
    return _NON_ALPHANUMERIC_RE.sub(' ', text.lower())


def get_most_frequent_words(text, n):
    """Return the *n* most common words of at least five characters.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    return Counter(_LONG_WORD_RE.findall(text)).most_common(n)


def get_sentences_with_common_words(text, common_words):
    """Return the sentences of *text* that contain any of *common_words*.

    Sentences are delimited by ``.``, ``?`` or ``!``. Matching is a plain,
    case-sensitive substring test, so a word also matches inside a longer
    word. Each selected sentence is returned once, stripped of surrounding
    whitespace, in document order.
    """
    return [
        sentence.strip()
        for sentence in _SENTENCE_END_RE.split(text)
        if any(word in sentence for word in common_words)
    ]


def render_heatmap(words, sentences):
    """Render the word frequencies as a Plotly treemap in the Streamlit page.

    Args:
        words: Iterable of ``(word, frequency)`` pairs.
        sentences: Accepted for interface compatibility; not used by the
            chart.
    """
    if not words:
        # Nothing to chart (e.g. the document has no word of 5+ characters).
        st.info('No words of five or more characters were found.')
        return

    df = pd.DataFrame(words, columns=['word', 'frequency'])
    fig = px.treemap(
        df,
        path=['word'],
        values='frequency',
        color='frequency',
        hover_data=['frequency'],
        color_continuous_scale='reds',
    )
    st.plotly_chart(fig, use_container_width=True)


def main():
    """Build the Streamlit page: fetch a document, count words, draw the chart."""
    st.title('Markdown Analyzer')

    # Get markdown from GitHub
    default_markdown_url = 'https://github.com/AaronCWacker/Yggdrasil/blob/main/README.md'
    markdown_url = st.sidebar.text_input(
        "Enter a URL to analyze (default is provided):", default_markdown_url
    )
    try:
        markdown = get_markdown_from_github(markdown_url)
    except requests.RequestException as exc:
        # Covers malformed URLs, connection problems, timeouts and HTTP errors.
        st.error(f'Could not download the document: {exc}')
        return

    # Preprocess text
    text = preprocess_text(markdown)

    # Get most frequent words
    n_most_frequent_words = st.sidebar.slider(
        'Number of most frequent words to display', 1, 20, 10
    )
    most_frequent_words = get_most_frequent_words(text, n_most_frequent_words)

    # Get sentences containing common words. The preprocessed text has had all
    # punctuation removed, so it contains no sentence terminators to split on;
    # use the lower-cased original so the lower-case words can still match.
    common_words = [word for word, _ in most_frequent_words]
    sentences = get_sentences_with_common_words(markdown.lower(), common_words)

    # Render heatmap
    render_heatmap(most_frequent_words, sentences)


if __name__ == '__main__':
    main()