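# --- File-viewer page residue captured with the source (not part of the program) ---
# awacke1's picture
# Update app.py
# 43216af
# raw
# history blame
# No virus
# 2.11 kB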
import streamlit as st
import requests
from transformers import pipeline
import plotly.express as px
import pandas as pd
from collections import Counter
import re
def get_markdown_from_github(url, timeout=10):
    """Download the document at ``url`` and return its body as text.

    GitHub "blob" page URLs
    (``https://github.com/<owner>/<repo>/blob/<ref>/<path>``) serve an HTML
    viewer page rather than the file itself, so they are rewritten to the
    matching ``raw.githubusercontent.com`` URL before the request is made.
    Any other URL is fetched unchanged.

    Args:
        url: Address of the document to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the app indefinitely.

    Returns:
        The response body decoded as text.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    raw_url = re.sub(
        r'^https?://(?:www\.)?github\.com/([^/]+)/([^/]+)/blob/(.+)$',
        r'https://raw.githubusercontent.com/\1/\2/\3',
        url.strip(),
    )
    response = requests.get(raw_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of analysing the text of an error page.
    response.raise_for_status()
    return response.text
def preprocess_text(text):
    """Normalize ``text`` for word counting.

    The input is lowercased, then every run of characters other than ASCII
    letters and digits is collapsed into a single space.
    """
    lowered = text.lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)
def get_most_frequent_words(text, n):
    """Return the ``n`` most common long words in ``text``.

    A word is any run of five or more word characters bounded by word
    breaks. The result is a list of ``(word, count)`` pairs ordered from
    most to least frequent, as produced by ``Counter.most_common``.
    """
    tally = Counter(re.findall(r'\b\w{5,}\b', text))
    return tally.most_common(n)
def get_sentences_with_common_words(text, common_words):
    """Return the sentences of ``text`` that mention any of ``common_words``.

    ``text`` is split on ``.``, ``?`` and ``!``. A fragment is kept, with
    surrounding whitespace stripped, when at least one of the words occurs
    in it as a substring. Fragments keep their original order and each one
    appears at most once per occurrence in the text.
    """
    fragments = re.split('[.?!]', text)
    return [
        fragment.strip()
        for fragment in fragments
        if any(word in fragment for word in common_words)
    ]
def render_heatmap(words, sentences):
    """Draw a treemap of word frequencies on the Streamlit page.

    ``words`` is a sequence of ``(word, frequency)`` pairs; each word becomes
    one tile that is both sized and coloured by its frequency. ``sentences``
    is accepted for interface compatibility but is not used here.
    """
    frequency_table = pd.DataFrame(words, columns=['word', 'frequency'])
    figure = px.treemap(
        frequency_table,
        path=['word'],
        values='frequency',
        color='frequency',
        hover_data=['frequency'],
        color_continuous_scale='reds',
    )
    st.plotly_chart(figure, use_container_width=True)
def main():
    """Run the Streamlit page: fetch a document, count its words, chart them.

    The sidebar supplies the URL to analyze and how many top words to show.
    The document is downloaded, normalized, and its most frequent words are
    rendered as a treemap. Fetch failures and documents with no countable
    words are reported on the page instead of raising.
    """
    st.title('Markdown Analyzer')

    # Sidebar inputs are created first so they stay visible even when the
    # fetch below fails and the function returns early.
    default_markdown_url = 'https://github.com/AaronCWacker/Yggdrasil/blob/main/README.md'
    markdown_url = st.sidebar.text_input("Enter a URL to analyze (default is provided):", default_markdown_url)
    n_most_frequent_words = st.sidebar.slider('Number of most frequent words to display', 1, 20, 10)

    if not markdown_url.strip():
        st.warning('Enter a URL to analyze.')
        return

    # Get markdown from GitHub
    try:
        markdown = get_markdown_from_github(markdown_url)
    except requests.RequestException as exc:
        # Covers malformed URLs, connection errors, timeouts and HTTP errors.
        st.error(f'Could not fetch {markdown_url}: {exc}')
        return

    # Preprocess text
    text = preprocess_text(markdown)

    # Get most frequent words
    most_frequent_words = get_most_frequent_words(text, n_most_frequent_words)
    if not most_frequent_words:
        st.warning('No words of five or more characters were found at that URL.')
        return

    # Get sentences containing common words.
    # The preprocessed text has had every '.', '?' and '!' replaced by spaces,
    # so it can no longer be split into sentences; use the raw document,
    # lowercased so it still matches the lowercase common words.
    common_words = [word for word, _ in most_frequent_words]
    sentences = get_sentences_with_common_words(markdown.lower(), common_words)

    # Render heatmap
    render_heatmap(most_frequent_words, sentences)


if __name__ == '__main__':
    main()