import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import defaultdict, Counter
import base64
from sklearn.manifold import MDS
import networkx as nx

st.set_page_config(layout="wide")
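
# Expected input (inferred from the parsing code below, not from a separate
# spec): one CSV row per manuscript line, with 'folio', 'par' and 'line'
# columns plus token columns whose names start with 't'. Each token is a
# comma-separated glyph list such as "d,a,i?,n"; '$' and <START>/<END>
# markers are treated as non-words.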


def extract_word_and_chars(token):
    """Split a comma-separated token into (word, list_of_chars).

    Returns (None, None) for the '$' placeholder and <START>/<END> markers.
    A '?' attached to a glyph (e.g. 'i?') is split into the base glyph
    followed by a standalone '?'.
    """
    if token == '$' or '<START>' in token or '<END>' in token:
        return None, None

    chars = []
    for char in token.split(','):
        if '?' in char:
            base_char = char.replace('?', '')
            if base_char:
                chars.append(base_char)
            chars.append('?')
        else:
            chars.append(char)

    word = ''.join(chars)
    return word, chars
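
# Example (hypothetical token): extract_word_and_chars("d,a,i?,n") returns
# ("dai?n", ['d', 'a', 'i', '?', 'n']) -- the uncertain glyph 'i?' becomes
# the base glyph 'i' followed by a standalone '?'.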


def analyze_csv(df):
    """Walk the transliteration table once and collect word/character stats."""
    words = []
    chars_list = []
    char_positions = defaultdict(list)
    char_connections = defaultdict(Counter)
    word_positions = []
    folio_word_map = defaultdict(Counter)

    # The token columns are the same for every row, so compute them once
    # instead of once per row.
    token_columns = [col for col in df.columns if col.startswith('t')]

    for _, row in df.iterrows():
        line_words = []

        for pos, col in enumerate(token_columns):
            token = row[col]
            if pd.notna(token) and token != '$':
                # Coerce to str so numeric-looking cells don't break .startswith.
                token = str(token)
                if token.startswith('"') and token.endswith('"'):
                    token = token[1:-1]

                word, chars = extract_word_and_chars(token)
                if word:
                    words.append(word)
                    chars_list.append(chars)
                    line_words.append((word, pos, chars))
                    folio_word_map[row['folio']][word] += 1

                    for j, char in enumerate(chars):
                        char_positions[char].append(j)

                    for j in range(len(chars) - 1):
                        char_connections[chars[j]][chars[j + 1]] += 1

        if line_words:
            word_positions.append({
                'folio': row['folio'],
                'par': row['par'],
                'line': row['line'],
                'words': line_words
            })

    return words, chars_list, char_positions, char_connections, word_positions, folio_word_map


def analyze_trigrams(words, chars_list):
    char_trigrams = Counter()
    word_trigrams = Counter()

    # Character trigrams within each word.
    for chars in chars_list:
        for i in range(len(chars) - 2):
            char_trigrams[tuple(chars[i:i + 3])] += 1

    # Word trigrams over the flat word stream (note: this crosses line boundaries).
    for i in range(len(words) - 2):
        word_trigrams[tuple(words[i:i + 3])] += 1

    return char_trigrams, word_trigrams


def create_12_slot_table(chars_list):
    slot_frequencies = [Counter() for _ in range(12)]

    for chars in chars_list:
        for i, char in enumerate(chars[:12]):
            slot_frequencies[i][char] += 1

    all_chars = sorted(set(char for counter in slot_frequencies for char in counter))

    data = []
    for char in all_chars:
        row = {'Character': char}
        for i in range(12):
            row[f'Slot_{i + 1}'] = slot_frequencies[i][char]
        data.append(row)

    return pd.DataFrame(data)
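
# The resulting frame has one row per distinct character and twelve columns
# Slot_1..Slot_12 counting how often that character occupies each slot.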


def analyze_slot_structure(chars_list):
    slot_contents = defaultdict(Counter)
    max_slots = 0

    for chars in chars_list:
        max_slots = max(max_slots, len(chars))
        for i, char in enumerate(chars):
            slot_contents[i][char] += 1

    slot_summary = {}
    for slot in range(max_slots):
        if slot in slot_contents:
            slot_summary[slot] = slot_contents[slot].most_common(10)

    return slot_summary, max_slots


def create_folio_word_scatter(folio_word_map):
    all_words = set()
    for word_counter in folio_word_map.values():
        all_words.update(word_counter.keys())
    # Sort so the column order (and thus the MDS input) is reproducible
    # across runs instead of depending on set iteration order.
    all_words = sorted(all_words)

    folios = sorted(folio_word_map.keys())
    word_freq_matrix = np.zeros((len(folios), len(all_words)))

    for i, folio in enumerate(folios):
        for j, word in enumerate(all_words):
            word_freq_matrix[i, j] = folio_word_map[folio][word]

    mds = MDS(n_components=2, random_state=42)
    folio_coords = mds.fit_transform(word_freq_matrix)

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.scatter(folio_coords[:, 0], folio_coords[:, 1])

    for i, folio in enumerate(folios):
        ax.annotate(folio, (folio_coords[i, 0], folio_coords[i, 1]))

    ax.set_title('Folio Similarity based on Word Usage')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')

    return fig
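
# Design note: the MDS above embeds raw counts, so longer folios dominate the
# distances. Row-normalizing word_freq_matrix before fit_transform would be a
# possible refinement; it is left out here to keep the original behavior.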


def plot_char_positions(char_positions, max_slots):
    chars = []
    positions = []
    counts = []

    for char, pos_list in char_positions.items():
        pos_counts = Counter(pos_list)
        for pos, count in pos_counts.items():
            if pos < max_slots:
                chars.append(char)
                positions.append(pos)
                counts.append(count)

    df = pd.DataFrame({
        'Character': chars,
        'Position': positions,
        'Count': counts
    })
    pivot_df = df.pivot(index='Character', columns='Position', values='Count').fillna(0)

    fig, ax = plt.subplots(figsize=(15, 10))
    sns.heatmap(pivot_df, cmap="YlGnBu", ax=ax)
    ax.set_title('Character Position Heatmap')
    ax.set_xlabel('Position in Word')
    ax.set_ylabel('Character')
    return fig


def render_word_slots(word, chars, n_slots=12):
    """Draw a word as a row of fixed-size character boxes.

    Factored out of the two viewer sections below, which previously carried
    identical copies of this markup.
    """
    st.write(f"Word: {word}")
    cols = st.columns(n_slots)
    for i in range(n_slots):
        with cols[i]:
            char = chars[i] if i < len(chars) else ""
            st.markdown(f"""
                <div style='
                    width: 40px;
                    height: 40px;
                    border: 2px solid #ccc;
                    display: flex;
                    align-items: center;
                    justify-content: center;
                    font-size: 20px;
                    background-color: {"#e6f3ff" if char else "white"};
                    margin: 2px;
                '>
                    {char}
                </div>
            """, unsafe_allow_html=True)


def get_download_link_csv(df, filename):
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}">Download CSV</a>'
    return href
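
# st.download_button("Download CSV", df.to_csv(index=False), file_name=...) is
# a simpler built-in alternative to this base64 data-URI link; the link form
# is kept because the app renders it inline with st.markdown.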


st.title("Voynich Manuscript Analyzer")
st.write("Upload your CSV file to discover potential patterns and character distributions.")

uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
    words, chars_list, char_positions, char_connections, word_positions, folio_word_map = analyze_csv(df)

    st.subheader("Basic Statistics")
    st.write(f"Total words: {len(words)}")
    st.write(f"Total unique words: {len(set(words))}")
    unique_chars = set()
    for chars in chars_list:
        unique_chars.update(chars)
    st.write(f"Total unique characters: {len(unique_chars)}")
    st.write("Unique characters:", ", ".join(sorted(unique_chars)))

    st.subheader("Trigram Analysis")
    char_trigrams, word_trigrams = analyze_trigrams(words, chars_list)

    st.write("Top 20 Character Trigrams")
    char_trigram_df = pd.DataFrame([
        {'Trigram': ' - '.join(trigram), 'Count': count}
        for trigram, count in char_trigrams.most_common(20)
    ])
    st.dataframe(char_trigram_df)
    st.markdown(get_download_link_csv(char_trigram_df, "char_trigrams.csv"), unsafe_allow_html=True)

    st.write("Top 20 Word Trigrams")
    word_trigram_df = pd.DataFrame([
        {'Trigram': ' - '.join(trigram), 'Count': count}
        for trigram, count in word_trigrams.most_common(20)
    ])
    st.dataframe(word_trigram_df)
    st.markdown(get_download_link_csv(word_trigram_df, "word_trigrams.csv"), unsafe_allow_html=True)

    st.subheader("Character Bigram Analysis")
    char_bigrams = Counter()
    for chars in chars_list:
        for i in range(len(chars) - 1):
            char_bigrams[tuple(chars[i:i + 2])] += 1

    char_bigram_df = pd.DataFrame([
        {'Bigram': ' - '.join(bigram), 'Count': count}
        for bigram, count in char_bigrams.most_common(20)
    ])
    st.dataframe(char_bigram_df)
    st.markdown(get_download_link_csv(char_bigram_df, "char_bigrams.csv"), unsafe_allow_html=True)

    st.subheader("Word Bigram Analysis")
    word_bigrams = Counter()
    for i in range(len(words) - 1):
        word_bigrams[tuple(words[i:i + 2])] += 1

    word_bigram_df = pd.DataFrame([
        {'Bigram': ' - '.join(bigram), 'Count': count}
        for bigram, count in word_bigrams.most_common(20)
    ])
    st.dataframe(word_bigram_df)
    st.markdown(get_download_link_csv(word_bigram_df, "word_bigrams.csv"), unsafe_allow_html=True)

    st.subheader("12-Slot Character Frequency Table")
    slot_freq_df = create_12_slot_table(chars_list)
    st.dataframe(slot_freq_df)
    st.markdown(get_download_link_csv(slot_freq_df, "slot_frequencies.csv"), unsafe_allow_html=True)

    slot_summary, max_slots = analyze_slot_structure(chars_list)
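
    # Hook up the plot_char_positions helper (defined above but otherwise
    # never called) so its heatmap actually appears in the app.
    st.subheader("Character Position Heatmap")
    st.pyplot(plot_char_positions(char_positions, max_slots))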

    st.subheader("Words by Length Analysis")

    length_groups = defaultdict(list)
    for word, chars in zip(words, chars_list):
        length = len(chars)
        if length <= 12:
            length_groups[length].append((word, chars))

    selected_length = st.selectbox("Select word length to analyze:",
                                   sorted(length_groups.keys()))

    if selected_length:
        words_of_length = length_groups[selected_length]

        position_chars = [Counter() for _ in range(selected_length)]
        for _, chars in words_of_length:
            for i, char in enumerate(chars):
                position_chars[i][char] += 1

        st.write(f"Found {len(words_of_length)} words of length {selected_length}")

        # Sort the characters so the table has a stable row order.
        freq_data = []
        for char in sorted(unique_chars):
            row = {'Character': char}
            for pos in range(selected_length):
                row[f'Pos_{pos + 1}'] = position_chars[pos][char]
            freq_data.append(row)

        freq_df = pd.DataFrame(freq_data)
        st.dataframe(freq_df)
        st.markdown(get_download_link_csv(freq_df, f"length_{selected_length}_analysis.csv"),
                    unsafe_allow_html=True)

        st.write("Sample words of this length:")
        sample_df = pd.DataFrame([
            {'Word': word, 'Characters': ' '.join(chars)}
            for word, chars in words_of_length[:20]
        ])
        st.dataframe(sample_df)

    st.subheader("Word Distribution Across Folios")
    folio_scatter = create_folio_word_scatter(folio_word_map)
    st.pyplot(folio_scatter)

    st.subheader("Character Pattern Analysis")

    unique_chars = sorted(set(char for chars in chars_list for char in chars))
    selected_char = st.selectbox("Select a character to analyze:", unique_chars)

    if selected_char:
        before_counter = Counter()
        after_counter = Counter()

        for chars in chars_list:
            for i, char in enumerate(chars):
                if char == selected_char:
                    if i > 0:
                        before_counter[chars[i - 1]] += 1
                    if i < len(chars) - 1:
                        after_counter[chars[i + 1]] += 1

        col1, col2 = st.columns(2)

        with col1:
            st.write(f"Characters that commonly PRECEDE '{selected_char}':")
            before_df = pd.DataFrame(before_counter.most_common(10),
                                     columns=['Character', 'Count'])
            st.dataframe(before_df)

            fig1, ax1 = plt.subplots()
            ax1.bar(before_df['Character'], before_df['Count'])
            ax1.set_title(f"Characters before '{selected_char}'")
            ax1.tick_params(axis='x', rotation=45)
            st.pyplot(fig1)

        with col2:
            st.write(f"Characters that commonly FOLLOW '{selected_char}':")
            after_df = pd.DataFrame(after_counter.most_common(10),
                                    columns=['Character', 'Count'])
            st.dataframe(after_df)

            fig2, ax2 = plt.subplots()
            ax2.bar(after_df['Character'], after_df['Count'])
            ax2.set_title(f"Characters after '{selected_char}'")
            ax2.tick_params(axis='x', rotation=45)
            st.pyplot(fig2)

    st.subheader("Word Sequence Viewer")

    if 'current_line' not in st.session_state:
        st.session_state.current_line = ''

    available_folios = sorted(set(line_data['folio'] for line_data in word_positions))
    selected_folio = st.selectbox("Select Folio:", [''] + available_folios,
                                  key='folio_select',
                                  on_change=lambda: setattr(st.session_state, 'current_line', ''))

    available_lines = []
    if selected_folio:
        available_lines = sorted(set((line_data['par'], line_data['line'])
                                     for line_data in word_positions
                                     if line_data['folio'] == selected_folio))

    # Bind this widget to the 'current_line' state slot so the on_change reset
    # above actually clears the selection when the folio changes. A unique key
    # also avoids a DuplicateWidgetID clash with the Line Viewer below.
    selected_line = st.selectbox("Select Line:",
                                 [''] + [f"Par {par}, Line {line}" for par, line in available_lines],
                                 key='current_line')

    if selected_folio and selected_line:
        par, line = map(int, selected_line.replace('Par ', '').replace('Line ', '').split(', '))

        line_words = next((line_data['words']
                           for line_data in word_positions
                           if line_data['folio'] == selected_folio
                           and line_data['par'] == par
                           and line_data['line'] == line), [])

        for word, _, chars in line_words:
            render_word_slots(word, chars)

    st.subheader("Line Viewer")

    available_folios = sorted(set(line_data['folio'] for line_data in word_positions))
    # Unique keys keep these widgets distinct from the Word Sequence Viewer's.
    selected_folio = st.selectbox("Select Folio for Line View:", [''] + available_folios,
                                  key='line_view_folio')

    if selected_folio:
        available_lines = sorted(set((line_data['par'], line_data['line'])
                                     for line_data in word_positions
                                     if line_data['folio'] == selected_folio))

        selected_line = st.selectbox("Select Line:",
                                     [''] + [f"Par {par}, Line {line}" for par, line in available_lines],
                                     key='line_view_line')

        if selected_line:
            par, line = map(int, selected_line.replace('Par ', '').replace('Line ', '').split(', '))

            line_words = next((line_data['words']
                               for line_data in word_positions
                               if line_data['folio'] == selected_folio
                               and line_data['par'] == par
                               and line_data['line'] == line), [])

            for word, _, chars in line_words:
                render_word_slots(word, chars)

    st.subheader("Language Structure Analysis")

    # Word length distribution.
    fig_wordlen = plt.figure(figsize=(10, 6))
    word_lengths = [len(chars) for chars in chars_list]
    sns.histplot(word_lengths, bins=range(1, 14))
    plt.title("Word Length Distribution")
    plt.xlabel("Word Length")
    plt.ylabel("Frequency")
    st.pyplot(fig_wordlen)

    # Character-by-position counts over the first 12 slots.
    char_pos_matrix = np.zeros((len(unique_chars), 12))
    for chars in chars_list:
        for i, char in enumerate(chars):
            if i < 12:
                char_pos_matrix[unique_chars.index(char), i] += 1

    fig_charpos = plt.figure(figsize=(12, 8))
    sns.heatmap(char_pos_matrix,
                xticklabels=range(1, 13),
                yticklabels=unique_chars,
                cmap='YlOrRd')
    plt.title("Character Position Preferences")
    plt.xlabel("Position in Word")
    plt.ylabel("Character")
    st.pyplot(fig_charpos)

    st.subheader("Word Position Analysis")

    word_positions_in_lines = []
    line_lengths = []

    for line_data in word_positions:
        line_len = len(line_data['words'])
        line_lengths.append(line_len)
        for pos, (word, _, chars) in enumerate(line_data['words']):
            word_positions_in_lines.append({
                'position': pos + 1,
                'word_length': len(chars),
                'line_length': line_len
            })

    pos_df = pd.DataFrame(word_positions_in_lines)

    fig_pos_len = plt.figure(figsize=(10, 6))
    sns.boxplot(data=pos_df, x='position', y='word_length')
    plt.title("Word Length by Position in Line")
    plt.xlabel("Position in Line")
    plt.ylabel("Word Length")
    st.pyplot(fig_pos_len)

    # char_bigrams was already counted in the bigram section above; build a
    # small network from its 20 most common pairs instead of recounting.
    G = nx.Graph()
    for (char1, char2), count in char_bigrams.most_common(20):
        G.add_edge(char1, char2, weight=count)

    fig_bigram_net = plt.figure(figsize=(10, 10))
    pos = nx.spring_layout(G)

    edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
    max_weight = max(edge_weights) if edge_weights else 1

    nx.draw(G, pos, with_labels=True,
            node_color='lightblue',
            node_size=1000,
            font_size=12,
            width=[G[u][v]['weight'] / max_weight * 5 for u, v in G.edges()])
    plt.title("Top Character Connections")
    st.pyplot(fig_bigram_net)

    fig_line_len = plt.figure(figsize=(10, 6))
    sns.histplot(line_lengths)
    plt.title("Words per Line Distribution")
    plt.xlabel("Number of Words in Line")
    plt.ylabel("Frequency")
    st.pyplot(fig_line_len)

    first_chars = Counter(chars[0] for chars in chars_list)
    last_chars = Counter(chars[-1] for chars in chars_list)

    fig_ends, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    first_df = pd.DataFrame(first_chars.most_common(10),
                            columns=['Character', 'Count'])
    sns.barplot(data=first_df, x='Character', y='Count', ax=ax1)
    ax1.set_title("Most Common Initial Characters")
    ax1.tick_params(axis='x', rotation=45)

    last_df = pd.DataFrame(last_chars.most_common(10),
                           columns=['Character', 'Count'])
    sns.barplot(data=last_df, x='Character', y='Count', ax=ax2)
    ax2.set_title("Most Common Final Characters")
    ax2.tick_params(axis='x', rotation=45)
    st.pyplot(fig_ends)

    # char_trigrams already holds these counts (see analyze_trigrams above);
    # no need to recount them here.
    trigram_df = pd.DataFrame([
        {'Trigram': ' - '.join(trigram), 'Count': count}
        for trigram, count in char_trigrams.most_common(20)
    ])
    st.write("Most Common Character Sequences (Trigrams)")
    st.dataframe(trigram_df)

    word_lengths_by_line = []
    for line_data in word_positions:
        line_word_lengths = [len(chars) for _, _, chars in line_data['words']]
        if len(line_word_lengths) >= 5:
            word_lengths_by_line.append(line_word_lengths[:5])

    if word_lengths_by_line:
        length_corr = np.corrcoef(np.array(word_lengths_by_line).T)
        fig_corr_pos = plt.figure(figsize=(8, 8))
        sns.heatmap(length_corr,
                    annot=True,
                    cmap='coolwarm',
                    xticklabels=range(1, 6),
                    yticklabels=range(1, 6))
        plt.title("Word Length Correlations by Position")
        st.pyplot(fig_corr_pos)

    st.subheader("Advanced Grammar Pattern Analysis")

    pos_len_char_data = []
    for line_data in word_positions:
        for pos, (word, _, chars) in enumerate(line_data['words']):
            pos_len_char_data.append({
                'position': pos + 1,
                'length': len(chars),
                'first_char': chars[0],
                'last_char': chars[-1]
            })

    pos_len_df = pd.DataFrame(pos_len_char_data)

    fig_plc = plt.figure(figsize=(12, 6))
    pivot_data = pos_len_df.pivot_table(
        index='position',
        columns='length',
        values='first_char',
        aggfunc='count',
        fill_value=0
    )
    sns.heatmap(pivot_data, cmap='YlOrRd')
    plt.title("Word Length-Position Distribution with Character Markers")
    st.pyplot(fig_plc)

    position_bigrams = defaultdict(Counter)
    for line_data in word_positions:
        for pos, (word, _, chars) in enumerate(line_data['words']):
            for i in range(len(chars) - 1):
                position_bigrams[pos + 1][tuple(chars[i:i + 2])] += 1

    # One small bigram network per word position 1-4.
    for position in range(1, 5):
        fig_bp = plt.figure(figsize=(8, 8))
        G = nx.Graph()
        for (char1, char2), count in position_bigrams[position].most_common(15):
            G.add_edge(char1, char2, weight=count)

        pos = nx.spring_layout(G)
        edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
        max_weight = max(edge_weights) if edge_weights else 1

        nx.draw(G, pos, with_labels=True,
                node_color='lightblue',
                node_size=1000,
                font_size=12,
                width=[G[u][v]['weight'] / max_weight * 5 for u, v in G.edges()])

        plt.title(f"Character Connections in Position {position}")
        st.pyplot(fig_bp)

    pattern_matrix = defaultdict(lambda: defaultdict(int))
    for chars in chars_list:
        pattern_matrix[len(chars)][(chars[0], chars[-1])] += 1

    pattern_data = []
    for length in range(1, 13):
        for (first, last), count in pattern_matrix[length].items():
            pattern_data.append({
                'length': length,
                'pattern': f"{first}-{last}",
                'count': count
            })

    pattern_df = pd.DataFrame(pattern_data)
    fig_pat = plt.figure(figsize=(15, 8))
    pivot_patterns = pattern_df.pivot_table(
        index='pattern',
        columns='length',
        values='count',
        fill_value=0
    )
    sns.heatmap(pivot_patterns, cmap='YlOrRd')
    plt.title("Word Length-Pattern Distribution")
    st.pyplot(fig_pat)

    feature_data = []
    for line_data in word_positions:
        for pos, (word, _, chars) in enumerate(line_data['words']):
            feature_data.append({
                'position': pos + 1,
                'length': len(chars),
                # 'aeiou' is this script's working guess at vowel-like glyphs;
                # cast to int so the correlation matrix is plainly numeric.
                'initial_char_type': int(chars[0] in 'aeiou'),
                'final_char_type': int(chars[-1] in 'aeiou'),
                'has_special': int(any(c in '?^' for c in chars))
            })

    feature_df = pd.DataFrame(feature_data)
    corr_matrix = feature_df.corr()

    fig_feat_corr = plt.figure(figsize=(10, 10))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
    plt.title("Cross-Feature Correlation Matrix")
    st.pyplot(fig_feat_corr)

    st.subheader("Pattern Discovery")

    def find_related_patterns(word_chars):
        """Group words that differ in exactly one character position."""
        patterns = defaultdict(list)
        for chars in word_chars:
            # Wildcard each position in turn; words sharing a template differ
            # only at the starred slot.
            for i in range(len(chars)):
                template = list(chars)
                template[i] = '*'
                patterns[tuple(template)].append(''.join(chars))
        return {k: v for k, v in patterns.items() if len(v) > 1}
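
    # Example (hypothetical): ['d', 'a', 'm'] and ['d', 'o', 'm'] both produce
    # the template ('d', '*', 'm'), so they are reported as related words.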

    related_patterns = find_related_patterns(chars_list)
    st.write("Morphological Patterns (words differing by one character)")
    pattern_df = pd.DataFrame([
        {'Pattern': ''.join(pattern).replace('*', '_'),
         'Related Words': ', '.join(related)}
        for pattern, related in list(related_patterns.items())[:20]
    ])
    st.dataframe(pattern_df)

    def find_recurring_sequences(word_positions):
        """Count adjacent-word patterns keyed by (length, first character)."""
        sequences = defaultdict(int)
        for line_data in word_positions:
            line_words = line_data['words']
            for i in range(len(line_words) - 1):
                seq = tuple((len(w[2]), w[2][0]) for w in line_words[i:i + 2])
                sequences[seq] += 1
        return sequences
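
    # Each key pairs (word_length, first_char) signatures of two adjacent words,
    # e.g. ((4, 'd'), (5, 'o')): a 4-glyph d-word followed by a 5-glyph o-word.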

    recurring_seqs = find_recurring_sequences(word_positions)
    st.write("Common Word Sequences (length-initial patterns)")
    seq_df = pd.DataFrame([
        {'Sequence': ' → '.join(f"({l},{c})" for l, c in seq),
         'Count': count}
        for seq, count in sorted(recurring_seqs.items(),
                                 key=lambda x: x[1],
                                 reverse=True)[:15]
    ])
    st.dataframe(seq_df)

    pos_char_dist = defaultdict(lambda: defaultdict(int))
    for line_data in word_positions:
        for word_pos, (_, _, chars) in enumerate(line_data['words']):
            for char_pos, char in enumerate(chars):
                pos_char_dist[word_pos][char_pos, char] += 1

    fig_dist, axes = plt.subplots(1, 3, figsize=(15, 5))
    for word_pos in range(3):
        data = defaultdict(list)
        for (char_pos, char), count in pos_char_dist[word_pos].items():
            data['char_pos'].append(char_pos)
            data['char'].append(char)
            data['count'].append(count)

        # Use a local name rather than clobbering the uploaded-CSV `df`.
        df_pos = pd.DataFrame(data)
        if df_pos.empty:
            # Skip panels for word positions that never occur.
            continue
        pivot = df_pos.pivot(index='char', columns='char_pos', values='count')
        sns.heatmap(pivot, ax=axes[word_pos], cmap='YlOrRd')
        axes[word_pos].set_title(f'Word Position {word_pos + 1}')
    st.pyplot(fig_dist)

    st.subheader("Character Connection Patterns")

    @st.cache_data
    def generate_char_network(chars_list):
        char_bigrams = Counter()
        for chars in chars_list:
            for i in range(len(chars) - 1):
                char_bigrams[tuple(chars[i:i + 2])] += 1
        return char_bigrams

    char_bigrams = generate_char_network(chars_list)
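
    # The @st.cache_data decorator above memoizes on the (hashed) contents of
    # chars_list, so these counts are only recomputed when a new file is uploaded.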

    G = nx.Graph()
    edges_with_weights = []

    total_bigrams = sum(char_bigrams.values())

    # Keep only pairs accounting for more than 1% of all bigrams...
    for (char1, char2), count in char_bigrams.items():
        if count > total_bigrams * 0.01:
            edges_with_weights.append((char1, char2, count))

    # ...and cap the graph at the 50 heaviest edges.
    edges_with_weights.sort(key=lambda x: x[2], reverse=True)
    edges_with_weights = edges_with_weights[:50]

    for char1, char2, weight in edges_with_weights:
        G.add_edge(char1, char2, weight=weight)

    fig_network = plt.figure(figsize=(15, 15))
    pos = nx.spring_layout(G, k=1, seed=42)

    weights = [G[u][v]['weight'] for u, v in G.edges()]
    max_weight = max(weights) if weights else 1
    edge_widths = [w / max_weight * 5 for w in weights]

    nx.draw(G, pos,
            with_labels=True,
            node_color='lightblue',
            node_size=2000,
            font_size=14,
            width=edge_widths)

    plt.title("Character Connection Network")
    st.pyplot(fig_network, clear_figure=True)
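
else:
    # Nothing uploaded yet; show a hint instead of a blank page.
    st.info("Upload a CSV file to begin the analysis.")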