themeetjani committed on
Commit 1ef3d70
1 Parent(s): 5dc77e3

Upload 3 files

Files changed (3)
  1. pages/cg.py +50 -0
  2. pages/sp.py +15 -0
  3. pages/tc.py +154 -0
pages/cg.py ADDED
@@ -0,0 +1,50 @@
+ import streamlit as st
+ from streamlit import session_state
+ import os
+ import openai
+ openai.api_key = os.getenv("OPENAI_API_KEY")
+ import pandas as pd
+ from sklearn.preprocessing import LabelEncoder
+ import numpy as np
+
+ # Ask GPT-4 to generate a code snippet or query from a schema/description and an instruction
+ def gpt4_score(schema, query):
+     response = openai.ChatCompletion.create(
+         model="gpt-4",
+         messages=[
+             {
+                 "role": "system",
+                 "content": "You are a code generator assistant. Your task is to generate code or a query, in any language, based on the given instructions.\nIf it is SQL, also accept query instructions.\n\n<<REMEMBER>> Give only the code or query. Don't provide any extra information.\n\n<<OUTPUT>>"
+             },
+             {
+                 "role": "user",
+                 "content": f"Schema/ Detail: {schema}"
+             },
+             {
+                 "role": "user",
+                 "content": f"Query/instruction: {query}"
+             }
+         ],
+         temperature=0.7,
+         max_tokens=701,
+         top_p=1,
+         frequency_penalty=0,
+         presence_penalty=0
+     )
+     return response.choices[0].message.content
+
+ st.write("# Auto Code Generation! 👋")
+
+ if 'score' not in session_state:
+     session_state['score'] = ""
+
+ text1 = st.text_area(label="Please write the schema or a detailed explanation below",
+                      placeholder="What does the text say?")
+ text2 = st.text_area(label="Please write the query or code instructions below",
+                      placeholder="What does the text say?")
+
+ # Store the generated code in session state so it survives the rerun triggered by the button
+ def classify(text1, text2):
+     session_state['score'] = gpt4_score(text1, text2)
+
+ st.text_area("result", value=session_state['score'])
+
+ st.button("Classify", on_click=classify, args=[text1, text2])
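Note: openai.ChatCompletion.create only exists in the pre-1.0 openai package; pages/tc.py in this same commit already uses the 1.x client. A minimal sketch of the same call against openai>=1.0, keeping the model and sampling parameters above (the shortened system prompt here is illustrative, not part of the commit):

import os
import openai  # assumes openai>=1.0 is installed

client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def gpt4_score(schema, query):
    # Same prompt structure as the function above, expressed with the 1.x client interface
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a code generator assistant. Give only the code or query."},
            {"role": "user", "content": f"Schema/ Detail: {schema}"},
            {"role": "user", "content": f"Query/instruction: {query}"},
        ],
        temperature=0.7,
        max_tokens=701,
    )
    return response.choices[0].message.content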
pages/sp.py ADDED
@@ -0,0 +1,15 @@
+ import streamlit as st
+ import streamlit.components.v1 as components
+
+ # Embed the Gradio Space using the gradio.js web component
+ components.html(
+     """
+     <script
+         type="module"
+         src="https://gradio.s3-us-west-2.amazonaws.com/3.39.0/gradio.js"
+     ></script>
+
+     <gradio-app src="https://themeetjani-speech2.hf.space"></gradio-app>
+     """,
+     height=1600,
+ )
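pages/sp.py embeds the Space through the gradio.js web component pinned to version 3.39.0. An alternative sketch using Streamlit's built-in iframe helper, assuming the Space URL serves an embeddable page (illustrative only, not part of this commit):

import streamlit.components.v1 as components

# Embed the Hugging Face Space page directly in an iframe (assumes the Space allows embedding)
components.iframe("https://themeetjani-speech2.hf.space", height=1600)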
pages/tc.py ADDED
@@ -0,0 +1,154 @@
+ import streamlit as st
+ from streamlit import session_state
+ import re
+ import string
+ import json
+ from io import StringIO
+ import numpy as np
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import matplotlib.colors as mcolors
+ import nltk
+ from nltk import word_tokenize
+ from nltk.corpus import stopwords
+ from sentence_transformers import SentenceTransformer
+ from sklearn.cluster import MiniBatchKMeans, AgglomerativeClustering, k_means
+ from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
+ from sklearn.manifold import TSNE
+ from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
+ from openai import OpenAI
+
+ nltk.download("stopwords")
+ nltk.download('punkt')
+
+ # text preprocessing function
+ def clean_text_1(text):
+     stop_words = set(stopwords.words("english"))
+     def remove_stopwords(text):
+         return " ".join([word for word in str(text).split() if word not in stop_words])
+     text = remove_stopwords(text)
+     text = str(text).lower()                     # Lowercase words
+     text = re.sub(r"\[(.*?)\]", " ", text)       # Remove [+XYZ chars] in content
+     text = re.sub(r"\s+", " ", text)             # Remove multiple spaces in content
+     text = re.sub(r"\w+…|…", " ", text)          # Remove ellipsis (and last word)
+     text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # Replace dash between words
+     text = re.sub(
+         f"[{re.escape(string.punctuation)}]", "", text
+     )                                            # Remove punctuation
+     return text
+
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')  # Hugging Face model used for embeddings
+ client = OpenAI()
+
+ # Embed a piece of text with the sentence transformer model
+ def get_embedding(text):
+     # text = clean_text_1(text)  # optional cleaning step
+     return model.encode(text)
+
+ # Streamlit UI configuration
+ st.set_page_config(
+     page_title="text_clustering.py",
+     page_icon="👋",
+ )
+
+ # Upload file
+ uploaded_file = st.file_uploader("Choose a file")
+ if uploaded_file:
+     # Read data from file
+     df = pd.read_csv(uploaded_file)
+
+     # Clean data
+     df = df[df['text'].notna()].reset_index(drop=True)
+
+     # Get embeddings
+     df['embedding'] = df['text'].apply(get_embedding)
+     matrix = np.vstack(df['embedding'].values)
+
+     # Distance threshold slider
+     distance_threshold = st.slider("Select Distance Threshold", min_value=0.1, max_value=2.0, value=1.1, step=0.1)
+
+     # Perform clustering
+     agg_clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=distance_threshold, linkage='ward')
+     cluster_labels = agg_clustering.fit_predict(matrix)
+     df['Cluster'] = cluster_labels
+
+     # Visualize clusters with t-SNE
+     tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)
+     vis_dims2 = tsne.fit_transform(matrix)
+
+     x = [x for x, y in vis_dims2]
+     y = [y for x, y in vis_dims2]
+
+     unique_clusters, cluster_counts = np.unique(cluster_labels, return_counts=True)
+
+     # Create a colormap based on cluster sizes
+     colormap = plt.cm.get_cmap("viridis", len(unique_clusters))
+
+     fig, ax = plt.subplots()
+     for category, (color, size) in enumerate(zip(colormap.colors, cluster_counts)):
+         xs = np.array(x)[cluster_labels == category]
+         ys = np.array(y)[cluster_labels == category]
+
+         ax.scatter(xs, ys, color=color, alpha=0.3, label=f'Cluster {category} (Size: {size})')
+
+         avg_x = xs.mean()
+         avg_y = ys.mean()
+
+         ax.scatter(avg_x, avg_y, marker="x", color=color, s=100)
+
+     ax.set_title("Clusters identified, visualized in 2D using t-SNE")
+     ax.legend()
+
+     # Display the plot in Streamlit
+     st.pyplot(fig)
+     st.text_area("Number of Cluster Labels", value=len(np.unique(cluster_labels.tolist())))
+
+     # Read a few reviews from each cluster and ask GPT-4 for a common theme
+     rev_per_cluster = 3
+     n_clusters = len(np.unique(cluster_labels.tolist()))
+
+     for i in range(n_clusters):
+         print(f"Cluster {i} Theme:", end=" ")
+
+         reviews = "\n".join(
+             df[df.Cluster == i]
+             .text.str.replace("Title: ", "")
+             .str.replace("\n\nContent: ", ": ")
+             .sample(rev_per_cluster, random_state=42)
+             .values
+         )
+
+         messages = [
+             {"role": "user", "content": f'What do the following have in common?\n\nValues:\n"""\n{reviews}\n"""\n\nTheme:'}
+         ]
+
+         response = client.chat.completions.create(
+             model="gpt-4",
+             messages=messages,
+             temperature=0,
+             max_tokens=64,
+             top_p=1,
+             frequency_penalty=0,
+             presence_penalty=0)
+         print(response.choices[0].message.content.replace("\n", ""))
+         st.text_area(f"Cluster {i} Theme", value=response.choices[0].message.content.replace("\n", ""))
+
+         # sample_cluster_rows = df[df.Cluster == i].sample(rev_per_cluster, random_state=42)
+         # for j in range(rev_per_cluster):
+         #     print(sample_cluster_rows.Score.values[j], end=", ")
+         #     print(sample_cluster_rows.Summary.values[j], end=": ")
+         #     print(sample_cluster_rows.Text.str[:70].values[j])
+
+         # print("-" * 100)
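All three scripts sit under pages/, which Streamlit treats as the pages of a multipage app, so the repository also needs a top-level entry script. A minimal sketch, assuming a hypothetical Home.py next to the pages/ directory (the file name and copy are illustrative, not part of this commit):

# Home.py (hypothetical entry point, not included in this commit)
import streamlit as st

st.set_page_config(page_title="Demos", page_icon="👋")
st.write("# Demos")
st.write("Pick a page from the sidebar: cg (code generation), sp (speech), or tc (text clustering).")

The app would then be launched with: streamlit run Home.py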