kritsadaK committed on
Commit
16ab143
·
1 Parent(s): 7934d70

Initial commit

Browse files
Files changed (2) hide show
  1. app.py +82 -55
  2. requirements.txt +1 -0
app.py CHANGED
@@ -2,7 +2,7 @@ import warnings
2
  import torchvision
3
  import torch
4
  import pandas as pd
5
- from transformers import pipeline, AutoTokenizer, AutoModel
6
  from sklearn.metrics.pairwise import cosine_similarity
7
  import streamlit as st
8
 
@@ -10,41 +10,65 @@ import streamlit as st
10
  torchvision.disable_beta_transforms_warning()
11
  warnings.filterwarnings("ignore", category=UserWarning, module="torchvision")
12
 
13
- # Initialize fill-mask pipeline and model/tokenizer for embedding with slow tokenizer
14
- pipe = pipeline(
15
- "fill-mask",
16
- model="airesearch/wangchanberta-base-att-spm-uncased",
17
- tokenizer=AutoTokenizer.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased", use_fast=False),
18
- framework="pt"
19
- )
20
- model = AutoModel.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased")
21
-
22
- # Function to generate embeddings for full sentences
 
 
 
 
 
23
  def get_embedding(text):
24
- inputs = pipe.tokenizer(text, return_tensors="pt")
25
  with torch.no_grad():
26
  outputs = model(**inputs)
27
- return outputs.last_hidden_state[:, 0, :].cpu().numpy()
28
 
29
  # Streamlit app setup
30
  st.title("Thai Full Sentence Similarity App")
31
 
32
- # Explanation of example usage
33
- st.markdown("""
34
- ### Example Sentence with Mask:
35
- **Input:** `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน <mask> เพื่อสัมผัสธรรมชาติ"`
 
 
 
 
 
 
36
 
37
- In this example, the model will replace `<mask>` with possible locations in Thailand, such as:
38
- - "เชียงใหม่" for "Chiang Mai"
39
- - "เขาใหญ่" for "Khao Yai"
40
- - "ภูเก็ต" for "Phuket"
 
 
41
 
42
- The app will compute the similarity between the full sentences generated and the baseline sentence without `<mask>`.
 
 
 
 
 
 
 
 
 
 
 
43
  """)
44
 
45
  # User input box
46
  st.subheader("Input Text")
47
- input_text = st.text_input("Enter a sentence with `<mask>` to find similar predictions:", "นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน <mask> เพื่อสัมผัสธรรมชาติ")
48
 
49
  # Ensure the input includes a `<mask>`
50
  if "<mask>" not in input_text:
@@ -53,42 +77,45 @@ if "<mask>" not in input_text:
53
 
54
  # Process the input when available
55
  if input_text:
56
- # Display input text
57
  st.write(f"Input Text: {input_text}")
58
 
59
- # Get baseline embedding for comparison (remove "<mask>" to get the full sentence)
60
  baseline_text = input_text.replace("<mask>", "")
61
  input_embedding = get_embedding(baseline_text)
62
 
63
  # Generate mask predictions and calculate similarity with the full sentences
64
  similarity_results = []
65
- result = pipe(input_text)
66
-
67
- for r in result:
68
- # Full predicted sentence
69
- prediction_text = r['sequence']
70
-
71
- # Calculate embedding and similarity for the full sentence
72
- prediction_embedding = get_embedding(prediction_text)
73
- similarity = cosine_similarity(input_embedding, prediction_embedding)[0][0]
74
-
75
- # Append results to the list
76
- similarity_results.append({
77
- "Prediction": prediction_text,
78
- "Similarity Score": similarity,
79
- "Model Score": r['score']
80
- })
81
-
82
- # Convert results to DataFrame for easy sorting and display
83
- df_results = pd.DataFrame(similarity_results).sort_values(by="Similarity Score", ascending=False)
84
-
85
- # Display all predictions sorted by similarity score
86
- st.subheader("All Predictions Sorted by Similarity")
87
- st.dataframe(df_results)
88
-
89
- # Find and display the most similar prediction
90
- most_similar = df_results.iloc[0]
91
- st.subheader("Most Similar Prediction")
92
- st.write(f"**Prediction**: {most_similar['Prediction']}")
93
- st.write(f"**Similarity Score**: {most_similar['Similarity Score']:.4f}")
94
- st.write(f"**Model Score**: {most_similar['Model Score']:.4f}")
 
 
 
 
 
2
  import torchvision
3
  import torch
4
  import pandas as pd
5
+ from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
6
  from sklearn.metrics.pairwise import cosine_similarity
7
  import streamlit as st
8
 
 
10
  torchvision.disable_beta_transforms_warning()
11
  warnings.filterwarnings("ignore", category=UserWarning, module="torchvision")
12
 
13
+ # Load tokenizer and model with error handling for compatibility
14
+ try:
15
+ tokenizer = AutoTokenizer.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased", use_fast=False)
16
+ model = AutoModelForMaskedLM.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased")
17
+ model_name = "airesearch/wangchanberta-base-att-spm-uncased"
18
+ except Exception:
19
+ st.warning("Switching to xlm-roberta-base model due to compatibility issues.")
20
+ tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
21
+ model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")
22
+ model_name = "xlm-roberta-base"
23
+
24
+ # Initialize the fill-mask pipeline
25
+ pipe = pipeline("fill-mask", model=model, tokenizer=tokenizer, framework="pt")
26
+
27
+ # Function to generate embeddings
28
  def get_embedding(text):
29
+ inputs = tokenizer(text, return_tensors="pt")
30
  with torch.no_grad():
31
  outputs = model(**inputs)
32
+ return outputs.logits[:, 0, :].cpu().numpy()
33
 
34
  # Streamlit app setup
35
  st.title("Thai Full Sentence Similarity App")
36
 
37
+ st.write("""
38
+ ### How This App Works
39
+ This app uses a mask-filling model to predict possible words or phrases that could fill in the `<mask>` token in a given sentence. It then calculates the similarity of each prediction with the original sentence to determine the most contextually appropriate completion.
40
+
41
+ ### Example Sentence
42
+ In this example, we have the following sentence in Thai with a `<mask>` token:
43
+ - **Input**: `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน <mask> เพื่อสัมผัสธรรมชาติ"`
44
+ - **Translation**: "Many tourists choose to visit `<mask>` to experience nature."
45
+
46
+ The `<mask>` token represents a location popular for its natural beauty.
47
 
48
+ ### Potential Predictions
49
+ Here are some possible predictions the model might generate for `<mask>`:
50
+ 1. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เชียงใหม่ เพื่อสัมผัสธรรมชาติ"` - Chiang Mai
51
+ 2. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เขาใหญ่ เพื่อสัมผัสธรรมชาติ"` - Khao Yai
52
+ 3. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เกาะสมุย เพื่อสัมผัสธรรมชาติ"` - Koh Samui
53
+ 4. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน ภูเก็ต เพื่อสัมผัสธรรมชาติ"` - Phuket
54
 
55
+ ### Results Table
56
+ For each prediction, the app calculates:
57
+ - **Similarity Score**: Indicates how similar the predicted sentence is to the original input.
58
+ - **Model Score**: Represents the model's confidence in the predicted word for `<mask>`.
59
+
60
+ ### Most Similar Prediction
61
+ The app will display the most contextually similar prediction based on the similarity score. For example:
62
+ - **Most Similar Prediction**: `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เชียงใหม่ เพื่อสัมผัสธรรมชาติ"`
63
+ - **Similarity Score**: 0.89
64
+ - **Model Score**: 0.16
65
+
66
+ Feel free to enter your own sentence with `<mask>` and explore the predictions!
67
  """)
68
 
69
  # User input box
70
  st.subheader("Input Text")
71
+ input_text = st.text_input("Enter a sentence with `<mask>` to find similar predictions:", "ผู้ใช้งานท่าอากาศยานนานาชาติ <mask> มีกว่าสามล้านคน")
72
 
73
  # Ensure the input includes a `<mask>`
74
  if "<mask>" not in input_text:
 
77
 
78
  # Process the input when available
79
  if input_text:
 
80
  st.write(f"Input Text: {input_text}")
81
 
82
+ # Generate baseline embedding (removing `<mask>` to get the full sentence)
83
  baseline_text = input_text.replace("<mask>", "")
84
  input_embedding = get_embedding(baseline_text)
85
 
86
  # Generate mask predictions and calculate similarity with the full sentences
87
  similarity_results = []
88
+
89
+ try:
90
+ result = pipe(input_text)
91
+
92
+ for r in result:
93
+ # Adjust based on observed output structure
94
+ prediction_text = r.get('sequence', '')
95
+
96
+ # Only proceed if we have a valid prediction text
97
+ if prediction_text:
98
+ prediction_embedding = get_embedding(prediction_text)
99
+ similarity = cosine_similarity(input_embedding, prediction_embedding)[0][0]
100
+ similarity_results.append({
101
+ "Prediction": prediction_text,
102
+ "Similarity Score": similarity,
103
+ "Model Score": r['score']
104
+ })
105
+
106
+ # Convert results to DataFrame for easy sorting and display
107
+ df_results = pd.DataFrame(similarity_results).sort_values(by="Similarity Score", ascending=False)
108
+
109
+ # Display all predictions sorted by similarity score
110
+ st.subheader("All Predictions Sorted by Similarity")
111
+ st.dataframe(df_results)
112
+
113
+ # Display the most similar prediction
114
+ most_similar = df_results.iloc[0]
115
+ st.subheader("Most Similar Prediction")
116
+ st.write(f"**Prediction**: {most_similar['Prediction']}")
117
+ st.write(f"**Similarity Score**: {most_similar['Similarity Score']:.4f}")
118
+ st.write(f"**Model Score**: {most_similar['Model Score']:.4f}")
119
+
120
+ except KeyError:
121
+ st.error("Unexpected model output structure; unable to retrieve predictions.")
requirements.txt CHANGED
@@ -4,3 +4,4 @@ streamlit
4
  pandas
5
  scikit-learn
6
  torchvision
 
 
4
  pandas
5
  scikit-learn
6
  torchvision
7
+ sentencepiece