File size: 6,159 Bytes
7f1d38b 355f3ac 6bf8d03 e031c18 0e46985 e031c18 0e46985 e031c18 0e46985 e031c18 0e46985 e031c18 0e46985 e031c18 70dc026 e031c18 70dc026 e031c18 70dc026 e031c18 70dc026 e031c18 70dc026 e031c18 70dc026 e031c18 0e46985 e031c18 0e46985 e031c18 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
#syntax_analysis.py
import streamlit as st
import spacy
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
# NOTE: the spaCy model is no longer loaded globally; callers pass the loaded `nlp` pipeline in as an argument.
# Fill colour (hex) for each Universal POS tag; used for graph nodes, the
# legend, and repeated-word highlighting. Tags missing from this table are
# looked up with a white ('#FFFFFF') fallback by the functions below.
POS_COLORS = {
    'ADJ': '#FFA07A',    # Light Salmon
    'ADP': '#98FB98',    # Pale Green
    'ADV': '#87CEFA',    # Light Sky Blue
    'AUX': '#DDA0DD',    # Plum
    'CCONJ': '#F0E68C',  # Khaki
    'DET': '#FFB6C1',    # Light Pink
    'INTJ': '#FF6347',   # Tomato
    'NOUN': '#90EE90',   # Light Green
    'NUM': '#FAFAD2',    # Light Goldenrod Yellow
    'PART': '#D3D3D3',   # Light Gray
    'PRON': '#FFA500',   # Orange
    'PROPN': '#20B2AA',  # Light Sea Green
    'SCONJ': '#DEB887',  # Burlywood
    'SYM': '#7B68EE',    # Medium Slate Blue
    'VERB': '#FF69B4',   # Hot Pink
    'X': '#A9A9A9',      # Dark Gray
}
# Human-readable names for each Universal POS tag, keyed first by UI language
# code ('es', 'en', 'fr') and then by tag. Every language carries the same
# set of tags as POS_COLORS.
# The accented Spanish/French labels were previously stored as mis-decoded
# text (UTF-8 bytes read in a different code page); they are restored here.
POS_TRANSLATIONS = {
    'es': {
        'ADJ': 'Adjetivo',
        'ADP': 'Adposición',
        'ADV': 'Adverbio',
        'AUX': 'Auxiliar',
        'CCONJ': 'Conjunción Coordinante',
        'DET': 'Determinante',
        'INTJ': 'Interjección',
        'NOUN': 'Sustantivo',
        'NUM': 'Número',
        'PART': 'Partícula',
        'PRON': 'Pronombre',
        'PROPN': 'Nombre Propio',
        'SCONJ': 'Conjunción Subordinante',
        'SYM': 'Símbolo',
        'VERB': 'Verbo',
        'X': 'Otro',
    },
    'en': {
        'ADJ': 'Adjective',
        'ADP': 'Adposition',
        'ADV': 'Adverb',
        'AUX': 'Auxiliary',
        'CCONJ': 'Coordinating Conjunction',
        'DET': 'Determiner',
        'INTJ': 'Interjection',
        'NOUN': 'Noun',
        'NUM': 'Number',
        'PART': 'Particle',
        'PRON': 'Pronoun',
        'PROPN': 'Proper Noun',
        'SCONJ': 'Subordinating Conjunction',
        'SYM': 'Symbol',
        'VERB': 'Verb',
        'X': 'Other',
    },
    'fr': {
        'ADJ': 'Adjectif',
        'ADP': 'Adposition',
        'ADV': 'Adverbe',
        'AUX': 'Auxiliaire',
        'CCONJ': 'Conjonction de Coordination',
        'DET': 'Déterminant',
        'INTJ': 'Interjection',
        'NOUN': 'Nom',
        'NUM': 'Nombre',
        'PART': 'Particule',
        'PRON': 'Pronom',
        'PROPN': 'Nom Propre',
        'SCONJ': 'Conjonction de Subordination',
        'SYM': 'Symbole',
        'VERB': 'Verbe',
        'X': 'Autre',
    },
}
def count_pos(doc):
    """Tally how often each part-of-speech tag occurs in *doc*.

    Tokens tagged ``'PUNCT'`` are left out of the tally.

    Args:
        doc: Iterable of tokens exposing a ``pos_`` attribute.

    Returns:
        A ``Counter`` mapping POS tag to its number of occurrences.
    """
    tally = Counter()
    for tok in doc:
        tag = tok.pos_
        if tag == 'PUNCT':
            continue
        tally[tag] += 1
    return tally
def create_syntax_graph(doc, lang):
    """Build a directed dependency graph with one node per distinct word.

    Words are keyed by their lowercased text, so repeated words share a single
    node whose label, POS tag and colour come from the first occurrence.
    Tokens tagged ``'PUNCT'`` never become nodes, and no edge is drawn from a
    punctuation head or for a token whose dependency label is ``"ROOT"``.

    Args:
        doc: Parsed document; iterated for tokens exposing ``text``, ``pos_``,
            ``dep_`` and ``head``.
        lang: Key into ``POS_TRANSLATIONS`` selecting the label language.

    Returns:
        A tuple ``(graph, word_colors)``: ``graph`` is an ``nx.DiGraph`` whose
        nodes carry ``label``, ``pos``, ``size`` and ``color`` attributes and
        whose head -> dependent edges carry the dependency relation as
        ``label``; ``word_colors`` maps each lowercased word to its colour.

    Raises:
        KeyError: If ``lang`` is not a key of ``POS_TRANSLATIONS`` (raised
            when the first node is created).
    """
    graph = nx.DiGraph()
    pos_counts = count_pos(doc)
    word_nodes = {}
    word_colors = {}

    # Pass 1: create every node before any edge is considered. Doing both in
    # a single pass silently dropped every dependency whose head appears
    # later in the text (e.g. determiner -> noun), because the head had no
    # node yet when its dependent was visited.
    for token in doc:
        if token.pos_ == 'PUNCT':
            continue
        key = token.text.lower()
        if key in word_nodes:
            continue
        node_id = len(word_nodes)
        word_nodes[key] = node_id
        color = POS_COLORS.get(token.pos_, '#FFFFFF')
        word_colors[key] = color
        pos_label = POS_TRANSLATIONS[lang].get(token.pos_, token.pos_)
        graph.add_node(
            node_id,
            label=f"{token.text}\n[{pos_label}]",
            pos=token.pos_,
            # Node size scales with how frequent the token's POS tag is.
            size=pos_counts[token.pos_] * 500,
            color=color,
        )

    # Pass 2: connect each word to its syntactic head.
    for token in doc:
        if token.pos_ == 'PUNCT':
            continue
        if token.dep_ == "ROOT" or token.head.pos_ == 'PUNCT':
            continue
        head_id = word_nodes.get(token.head.text.lower())
        # Guard kept for heads that were not among the iterated tokens.
        if head_id is not None:
            graph.add_edge(head_id, word_nodes[token.text.lower()], label=token.dep_)

    return graph, word_colors
def visualize_syntax_graph(doc, lang):
    """Draw the syntax graph of *doc* on a new matplotlib figure.

    The graph comes from ``create_syntax_graph``. Nodes are coloured and sized
    by their stored attributes, edges are labelled with the dependency
    relation, and a legend lists every POS tag present with its token count.

    Args:
        doc: Parsed document accepted by ``create_syntax_graph`` and
            ``count_pos``.
        lang: Language code for labels. The title is English for ``'en'``,
            French for ``'fr'`` and Spanish for anything else.

    Returns:
        The ``matplotlib.pyplot`` module, with the new figure drawn on it.
        The figure is not closed here.
    """
    graph, _ = create_syntax_graph(doc, lang)  # the colour map is not needed here

    plt.figure(figsize=(24, 18))
    layout = nx.spring_layout(graph, k=0.9, iterations=50)

    node_data = list(graph.nodes(data=True))
    node_colors = [attrs['color'] for _, attrs in node_data]
    node_sizes = [attrs['size'] for _, attrs in node_data]
    node_labels = {node: attrs['label'] for node, attrs in node_data}

    nx.draw(graph, layout, with_labels=False, node_color=node_colors,
            node_size=node_sizes, arrows=True, arrowsize=20, width=2,
            edge_color='gray')
    nx.draw_networkx_labels(graph, layout, node_labels,
                            font_size=10, font_weight='bold')
    edge_labels = nx.get_edge_attributes(graph, 'label')
    nx.draw_networkx_edge_labels(graph, layout, edge_labels=edge_labels, font_size=8)

    # The Spanish default title was stored as mis-decoded text; restored here.
    titles = {'en': "Syntactic Analysis", 'fr': "Analyse Syntaxique"}
    plt.title(titles.get(lang, "Análisis Sintáctico"), fontsize=20, fontweight='bold')
    plt.axis('off')

    # Compute the tag counts and the set of tags in use once, rather than
    # once per legend entry.
    pos_counts = count_pos(doc)
    used_tags = set(nx.get_node_attributes(graph, 'pos').values())
    legend_elements = [
        plt.Rectangle((0, 0), 1, 1, facecolor=color, edgecolor='none',
                      label=f"{POS_TRANSLATIONS[lang][tag]} ({pos_counts[tag]})")
        for tag, color in POS_COLORS.items()
        if tag in used_tags
    ]
    plt.legend(handles=legend_elements, loc='center left',
               bbox_to_anchor=(1, 0.5), fontsize=12)
    return plt
def visualize_syntax(text, nlp, lang):
    """Parse *text* with *nlp* and render its syntax graph.

    Documents longer than 5000 tokens are cut down to their first 5000 tokens
    and a warning is printed.

    Args:
        text: Raw text to analyse.
        nlp: Callable that turns ``text`` into a parsed document; the result
            must support ``len()`` and, when truncation applies, token slicing
            with ``.as_doc()`` (as spaCy ``Doc``/``Span`` objects do).
        lang: Language code forwarded to ``visualize_syntax_graph``.

    Returns:
        Whatever ``visualize_syntax_graph`` returns for the (possibly
        truncated) document.
    """
    max_tokens = 5000
    doc = nlp(text)
    if len(doc) > max_tokens:
        # Truncate by tokens, as the limit and the warning say. Slicing
        # `text[:max_tokens]` cut by characters instead, leaving far fewer
        # tokens than advertised and possibly splitting a word.
        doc = doc[:max_tokens].as_doc()
        print(f"Warning: The input text is too long. Only the first {max_tokens} tokens will be visualized.")
    return visualize_syntax_graph(doc, lang)
def get_repeated_words_colors(doc):
    """Map every repeated word in *doc* to the colour of its POS tag.

    A word counts as repeated when its lowercased text occurs more than once
    among tokens not tagged ``'PUNCT'``. When a word occurs with different
    tags, the colour of its last occurrence in *doc* is kept.

    Args:
        doc: Iterable of tokens exposing ``text`` and ``pos_``; it is
            traversed twice.

    Returns:
        Dict from lowercased word to a colour taken from ``POS_COLORS``
        (``'#FFFFFF'`` for tags missing from that table).
    """
    occurrences = Counter()
    for tok in doc:
        if tok.pos_ != 'PUNCT':
            occurrences[tok.text.lower()] += 1

    palette = {}
    for tok in doc:
        key = tok.text.lower()
        # Counter lookups return 0 for unseen keys without inserting them.
        if occurrences[key] > 1:
            palette[key] = POS_COLORS.get(tok.pos_, '#FFFFFF')
    return palette
def highlight_repeated_words(doc, word_colors):
    """Render *doc* as an HTML string with selected words highlighted.

    Each token whose lowercased text is a key of *word_colors* is wrapped in
    a ``<span>`` with that background colour; other tokens are emitted as
    plain text. Tokens are joined with single spaces.

    Token text is HTML-escaped so that characters such as ``<``, ``>`` and
    ``&`` in the analysed text show up literally instead of being treated as
    markup when the result is rendered.

    Args:
        doc: Iterable of tokens exposing ``text``.
        word_colors: Dict from lowercased word to a CSS colour value. The
            colour is inserted into the ``style`` attribute as-is.

    Returns:
        The HTML fragment as a single string (empty for an empty *doc*).
    """
    # Imported locally so the block does not depend on a file-level import.
    from html import escape

    pieces = []
    for token in doc:
        safe_text = escape(token.text)
        color = word_colors.get(token.text.lower())
        if color is None:
            pieces.append(safe_text)
        else:
            pieces.append(f'<span style="background-color: {color};">{safe_text}</span>')
    return ' '.join(pieces)