# Spaces: Running  (page-status text captured along with the code; not part of the program)
import re
from collections import Counter

import pandas as pd
import plotly.express as px
import requests
import streamlit as st
from transformers import pipeline
def get_markdown_from_github(url):
    """Download the document at ``url`` and return its body as text.

    GitHub "blob" page URLs of the form
    ``https://github.com/<owner>/<repo>/blob/<ref>/<path>`` are rewritten to
    the matching ``raw.githubusercontent.com`` address before the request,
    because the blob page is an HTML viewer rather than the Markdown file
    itself. Any other URL is requested unchanged.

    Args:
        url: Address of the document to download.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    blob_match = re.match(
        r'https?://(?:www\.)?github\.com/([^/]+)/([^/]+)/blob/(.+)', url
    )
    if blob_match:
        owner, repo, ref_and_path = blob_match.groups()
        url = f'https://raw.githubusercontent.com/{owner}/{repo}/{ref_and_path}'

    # Bound the wait so an unresponsive host cannot hang the app forever.
    response = requests.get(url, timeout=10)
    # Fail loudly instead of handing the text of an error page to the analysis.
    response.raise_for_status()
    return response.text
def preprocess_text(text):
    """Lower-case ``text`` and turn every run of punctuation into one space.

    A "run of punctuation" is any sequence of characters that are neither
    letters nor digits; underscores count as punctuation. Letters and digits
    are matched Unicode-aware, so accented words such as "café" are kept whole
    instead of being split at the non-ASCII letter. Leading and trailing
    separators become a single space each; the result is not stripped.

    Args:
        text: The raw text to normalise.

    Returns:
        The lower-cased text with separators collapsed to single spaces.
    """
    # [\W_] = everything that is not a Unicode letter or digit.
    return re.sub(r'[\W_]+', ' ', text.lower())
def get_most_frequent_words(text, n, min_length=5):
    """Return the ``n`` most frequent words of at least ``min_length`` characters.

    Words are maximal runs of word characters (``\\w``) in ``text``. Matching
    is case-sensitive, so callers that want case-insensitive counts should
    lower-case the text first.

    Args:
        text: The text to scan.
        n: Maximum number of ``(word, count)`` pairs to return.
        min_length: Shortest word length that is counted. Defaults to 5.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent; empty when no word is long enough.

    Raises:
        ValueError: If ``min_length`` is smaller than 1.
    """
    if min_length < 1:
        raise ValueError('min_length must be at least 1')
    word_pattern = r'\b\w{%d,}\b' % min_length
    return Counter(re.findall(word_pattern, text)).most_common(n)
def get_sentences_with_common_words(text, common_words):
    """Return the sentences of ``text`` that contain any of ``common_words``.

    ``text`` is split at every ``.``, ``?`` and ``!``. A sentence is selected
    when at least one of the words occurs in it as a whole word; an occurrence
    inside a longer word does not count (``"other"`` does not select a
    sentence that only contains ``"mother"``). Matching is case-sensitive.

    Args:
        text: The text to split into sentences.
        common_words: Iterable of words to look for. Empty strings are
            ignored.

    Returns:
        The selected sentences, stripped of surrounding whitespace, in their
        original order. Empty when there are no words to look for.
    """
    escaped_words = [re.escape(word) for word in common_words if word]
    if not escaped_words:
        return []

    # Lookarounds rather than \b so that a word which itself starts or ends
    # with a non-word character is still matched as a whole token.
    whole_word = re.compile(r'(?<!\w)(?:%s)(?!\w)' % '|'.join(escaped_words))
    return [
        sentence.strip()
        for sentence in re.split(r'[.?!]', text)
        if whole_word.search(sentence)
    ]
def render_heatmap(words, sentences):
    """Draw a treemap of word frequencies in the Streamlit page.

    Args:
        words: Sequence of ``(word, frequency)`` pairs; each pair becomes one
            tile, sized and coloured by its frequency.
        sentences: Accepted for interface compatibility; not used here.
    """
    frame = pd.DataFrame(words, columns=['word', 'frequency'])
    treemap_options = {
        'path': ['word'],
        'values': 'frequency',
        'color': 'frequency',
        'hover_data': ['frequency'],
        'color_continuous_scale': 'reds',
    }
    figure = px.treemap(frame, **treemap_options)
    st.plotly_chart(figure, use_container_width=True)
def main():
    """Run the page: fetch a document, count its words and chart them.

    Reads the URL and the number of words to show from the sidebar, downloads
    the document, normalises its text, counts the most frequent words and
    renders them as a treemap. A failed download or a document without any
    countable words is reported on the page instead of raising.
    """
    st.title('Markdown Analyzer')

    # Create both sidebar widgets up front so they stay visible even when the
    # download below fails and the function returns early.
    default_markdown_url = 'https://github.com/AaronCWacker/Yggdrasil/blob/main/README.md'
    markdown_url = st.sidebar.text_input("Enter a URL to analyze (default is provided):", default_markdown_url)
    n_most_frequent_words = st.sidebar.slider('Number of most frequent words to display', 1, 20, 10)

    # Get markdown from GitHub
    try:
        markdown = get_markdown_from_github(markdown_url)
    except requests.RequestException as exc:
        st.error(f'Could not fetch {markdown_url}: {exc}')
        return

    # Preprocess text
    text = preprocess_text(markdown)

    # Get most frequent words
    most_frequent_words = get_most_frequent_words(text, n_most_frequent_words)
    if not most_frequent_words:
        st.warning('No words long enough to count were found in the document.')
        return

    # Get sentences containing common words. The preprocessed text has had
    # '.', '?' and '!' replaced by spaces, so it can no longer be split into
    # sentences; split the lower-cased raw document instead.
    common_words = [word for word, _ in most_frequent_words]
    sentences = get_sentences_with_common_words(markdown.lower(), common_words)

    # Render heatmap
    render_heatmap(most_frequent_words, sentences)
# Start the app only when this file is executed as the main program,
# not when it is imported as a module.
if __name__ == '__main__':
    main()