themeetjani committed on
Commit 1ef3d70
1 Parent(s): 5dc77e3

Upload 3 files

Files changed (3)
  1. pages/cg.py +50 -0
  2. pages/sp.py +15 -0
  3. pages/tc.py +154 -0
pages/cg.py ADDED
@@ -0,0 +1,50 @@
+ import streamlit as st
+ from streamlit import session_state
+ import os
+ import openai
+ openai.api_key = os.getenv("OPENAI_API_KEY")
+ import pandas as pd
+ from sklearn.preprocessing import LabelEncoder
+ import numpy as np
+
+ # Ask GPT-4 to generate a code snippet or query from a schema/description and an instruction
+ def gpt4_score(schema, query):
+     response = openai.ChatCompletion.create(
+         model="gpt-4",
+         messages=[
+             {
+                 "role": "system",
+                 "content": "You are a code generator assistant. Your task is to generate code or a query, in any language, based on the given instructions.\nIf it is SQL, also accept query instructions.\n\n<<REMEMBER>> Give only the code or query. Don't provide any extra information.\n\n<<OUTPUT>>"
+             },
+             {
+                 "role": "user",
+                 "content": f"Schema/ Detail: {schema}"
+             },
+             {
+                 "role": "user",
+                 "content": f"Query/instruction: {query}"
+             }
+         ],
+         temperature=0.7,
+         max_tokens=701,
+         top_p=1,
+         frequency_penalty=0,
+         presence_penalty=0
+     )
+     return response.choices[0].message.content
+
+ st.write("# Auto Code Generation! 👋")
+
+ if 'score' not in session_state:
+     session_state['score'] = ""
+
+ text1 = st.text_area(label="Please write the schema or a detailed explanation below",
+                      placeholder="What does the text say?")
+ text2 = st.text_area(label="Please write the query or code instructions below",
+                      placeholder="What does the text say?")
+
+ # Store the generated code in session state so it survives the rerun triggered by the button
+ def classify(text1, text2):
+     session_state['score'] = gpt4_score(text1, text2)
+
+ st.text_area("result", value=session_state['score'])
+
+ st.button("Classify", on_click=classify, args=[text1, text2])
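Note: openai.ChatCompletion.create only exists in the pre-1.0 openai package; pages/tc.py in this same commit already uses the 1.x client. A minimal sketch of the same call against openai>=1.0, keeping the model and sampling parameters above (the shortened system prompt here is illustrative, not part of the commit):

import os
import openai  # assumes openai>=1.0 is installed

client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def gpt4_score(schema, query):
    # Same prompt structure as the function above, expressed with the 1.x client interface
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a code generator assistant. Give only the code or query."},
            {"role": "user", "content": f"Schema/ Detail: {schema}"},
            {"role": "user", "content": f"Query/instruction: {query}"},
        ],
        temperature=0.7,
        max_tokens=701,
    )
    return response.choices[0].message.content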
pages/sp.py ADDED
@@ -0,0 +1,15 @@
+ import streamlit as st
+ import streamlit.components.v1 as components
+
+ # Embed the Gradio Space using the gradio.js web component
+ components.html(
+     """
+     <script
+         type="module"
+         src="https://gradio.s3-us-west-2.amazonaws.com/3.39.0/gradio.js"
+     ></script>
+
+     <gradio-app src="https://themeetjani-speech2.hf.space"></gradio-app>
+     """,
+     height=1600,
+ )
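pages/sp.py embeds the Space through the gradio.js web component pinned to version 3.39.0. An alternative sketch using Streamlit's built-in iframe helper, assuming the Space URL serves an embeddable page (illustrative only, not part of this commit):

import streamlit.components.v1 as components

# Embed the Hugging Face Space page directly in an iframe (assumes the Space allows embedding)
components.iframe("https://themeetjani-speech2.hf.space", height=1600)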
pages/tc.py ADDED
@@ -0,0 +1,154 @@
+ import streamlit as st
+ from streamlit import session_state
+ import re
+ import string
+ import json
+ from io import StringIO
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import matplotlib.colors as mcolors
+ import nltk
+ from nltk import word_tokenize
+ from nltk.corpus import stopwords
+ from sentence_transformers import SentenceTransformer
+ from sklearn.cluster import MiniBatchKMeans, AgglomerativeClustering, k_means
+ from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
+ from sklearn.manifold import TSNE
+ from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
+ from openai import OpenAI
+
+ nltk.download("stopwords")
+ nltk.download('punkt')
+
+ # text preprocessing function
+ def clean_text_1(text):
+     stop_words = set(stopwords.words("english"))
+     def remove_stopwords(text):
+         return " ".join([word for word in str(text).split() if word not in stop_words])
+     text = remove_stopwords(text)
+     text = str(text).lower()                     # Lowercase words
+     text = re.sub(r"\[(.*?)\]", " ", text)       # Remove [+XYZ chars] in content
+     text = re.sub(r"\s+", " ", text)             # Remove multiple spaces in content
+     text = re.sub(r"\w+…|…", " ", text)          # Remove ellipsis (and last word)
+     text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
+     text = re.sub(
+         f"[{re.escape(string.punctuation)}]", "", text
+     )                                            # Remove punctuation
+     return text
+
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')  # Hugging Face model used for embeddings
+ client = OpenAI()
+
+ # Embed a piece of text with the sentence transformer model
+ def get_embedding(text):
+     # text = clean_text_1(text)  # optional cleaning step
+     return model.encode(text)
+
+ # Streamlit UI configuration
+ st.set_page_config(
+     page_title="text_clustering.py",
+     page_icon="👋",
+ )
+
+ # Upload file
+ uploaded_file = st.file_uploader("Choose a file")
+ if uploaded_file:
+     # Read data from file
+     df = pd.read_csv(uploaded_file)
+
+     # Clean data
+     df = df[df['text'].notna()].reset_index(drop=True)
+
+     # Get embeddings
+     df['embedding'] = df['text'].apply(get_embedding)
+     matrix = np.vstack(df['embedding'].values)
+
+     # Distance threshold slider
+     distance_threshold = st.slider("Select Distance Threshold", min_value=0.1, max_value=2.0, value=1.1, step=0.1)
+
+     # Perform clustering
+     agg_clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, linkage='ward')
+     cluster_labels = agg_clustering.fit_predict(matrix)
+     df['Cluster'] = cluster_labels
+
+     # Visualize clusters with t-SNE
+     tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
+     vis_dims2 = tsne.fit_transform(matrix)
+
+     x = [x for x, y in vis_dims2]
+     y = [y for x, y in vis_dims2]
+
+     unique_clusters, cluster_counts = np.unique(cluster_labels, return_counts=True)
+
+     # Create a colormap based on cluster sizes
+     colormap = plt.cm.get_cmap("viridis", len(unique_clusters))
+
+     fig, ax = plt.subplots()
+     for category, (color, size) in enumerate(zip(colormap.colors, cluster_counts)):
+         xs = np.array(x)[cluster_labels == category]
+         ys = np.array(y)[cluster_labels == category]
+
+         ax.scatter(xs, ys, color=color, alpha=0.3, label=f'Cluster {category} (Size: {size})')
+
+         avg_x = xs.mean()
+         avg_y = ys.mean()
+
+         ax.scatter(avg_x, avg_y, marker="x", color=color, s=100)
+
+     ax.set_title("Clusters identified, visualized in 2D using t-SNE")
+     ax.legend()
+
+     # Display the plot in Streamlit
+     st.pyplot(fig)
+     st.text_area("Number of Cluster Labels", value=len(np.unique(cluster_labels.tolist())))
+
+     # Read a few reviews from each cluster and ask GPT-4 for a common theme
+     rev_per_cluster = 3
+     n_clusters = len(np.unique(cluster_labels.tolist()))
+
+     for i in range(n_clusters):
+         print(f"Cluster {i} Theme:", end=" ")
+
+         reviews = "\n".join(
+             df[df.Cluster == i]
+             .text.str.replace("Title: ", "")
+             .str.replace("\n\nContent: ", ": ")
+             .sample(rev_per_cluster, random_state=42)
+             .values
+         )
+
+         messages = [
+             {"role": "user", "content": f'What do the following have in common?\n\nValues:\n"""\n{reviews}\n"""\n\nTheme:'}
+         ]
+
+         response = client.chat.completions.create(
+             model="gpt-4",
+             messages=messages,
+             temperature=0,
+             max_tokens=64,
+             top_p=1,
+             frequency_penalty=0,
+             presence_penalty=0)
+         print(response.choices[0].message.content.replace("\n", ""))
+         st.text_area(f"Cluster {i} Theme", value=response.choices[0].message.content.replace("\n", ""))
+
+         # sample_cluster_rows = df[df.Cluster == i].sample(rev_per_cluster, random_state=42)
+         # for j in range(rev_per_cluster):
+         #     print(sample_cluster_rows.Score.values[j], end=", ")
+         #     print(sample_cluster_rows.Summary.values[j], end=": ")
+         #     print(sample_cluster_rows.Text.str[:70].values[j])
+
+         # print("-" * 100)
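All three scripts sit under pages/, which Streamlit treats as the pages of a multipage app, so the repository also needs a top-level entry script. A minimal sketch, assuming a hypothetical Home.py next to the pages/ directory (the file name and copy are illustrative, not part of this commit):

# Home.py (hypothetical entry point, not included in this commit)
import streamlit as st

st.set_page_config(page_title="Demos", page_icon="👋")
st.write("# Demos")
st.write("Pick a page from the sidebar: cg (code generation), sp (speech), or tc (text clustering).")

The app would then be launched with: streamlit run Home.py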