Sebbe33 committed on
Commit
e4f69cf
·
verified ·
1 Parent(s): d97111e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -21
app.py CHANGED
@@ -1,38 +1,72 @@
1
  import streamlit as st
2
  import google.generativeai as genai
 
3
 
4
  # Configure Gemini API
5
  genai.configure(api_key=st.secrets["GEMINI_API_KEY"])
6
 
7
- st.title("Embedding Test")
8
 
9
- # Text input area
10
- input_text = st.text_area("Enter your text to generate embedding:",
11
- height=150,
12
- placeholder="Type your text here...")
13
 
14
- # Button to generate embedding
15
- if st.button("Generate Embedding"):
16
- if not input_text.strip():
17
- st.warning("Please enter some text to generate embedding.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  else:
19
- with st.spinner("Generating embedding..."):
20
  try:
21
- # Generate embedding
22
- result = genai.embed_content(
23
- model="models/text-embedding-004",
24
- content=input_text
25
- )
 
 
 
 
 
26
 
27
- embedding = result['embedding']
 
 
 
28
 
29
  # Display results
 
 
30
 
31
- st.subheader("📌 Generated Embedding")
32
- st.write(f"**Embedding Dimension:** {len(embedding)}")
33
- st.code(str(embedding))
34
 
35
- st.success("Embedding generated successfully!")
 
 
 
36
 
37
  except Exception as e:
38
- st.error(f"Error generating embedding: {str(e)}")
 
1
  import streamlit as st
2
  import google.generativeai as genai
3
+ import numpy as np
4
 
5
  # Configure Gemini API
6
  genai.configure(api_key=st.secrets["GEMINI_API_KEY"])
7
 
8
+ st.title("Text Embedding Similarity Test")
9
 
10
+ def split_into_chunks(text, chunk_size=500):
11
+ """Split text into chunks of approximately specified character length"""
12
+ return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
 
13
 
14
+ def get_embedding(text):
15
+ """Get embedding for a single text chunk"""
16
+ return genai.embed_content(
17
+ model="models/text-embedding-004",
18
+ content=text
19
+ )['embedding']
20
+
21
+ def cosine_similarity(vec1, vec2):
22
+ """Compute cosine similarity between two vectors"""
23
+ return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
24
+
25
+ # Text input areas
26
+ col1, col2 = st.columns(2)
27
+ with col1:
28
+ input_text1 = st.text_area("Enter your first text:",
29
+ height=200,
30
+ placeholder="Type or paste your first text here...")
31
+
32
+ with col2:
33
+ input_text2 = st.text_area("Enter text to compare:",
34
+ height=200,
35
+ placeholder="Type or paste text to compare...")
36
+
37
+ if st.button("Run Similarity Test"):
38
+ if not input_text1.strip() or not input_text2.strip():
39
+ st.warning("Please enter text in both input fields.")
40
  else:
41
+ with st.spinner("Analyzing texts..."):
42
  try:
43
+ # Process first text into chunks
44
+ chunks = split_into_chunks(input_text1)
45
+ if len(chunks) > 1:
46
+ st.info(f"Split first text into {len(chunks)} chunks")
47
+
48
+ # Generate embeddings for all chunks
49
+ embeddings = [get_embedding(chunk) for chunk in chunks]
50
+
51
+ # Generate embedding for comparison text
52
+ compare_embedding = get_embedding(input_text2)
53
 
54
+ # Calculate similarities
55
+ similarities = [cosine_similarity(emb, compare_embedding) for emb in embeddings]
56
+ max_score = max(similarities)
57
+ max_index = similarities.index(max_score)
58
 
59
  # Display results
60
+ st.subheader("📊 Similarity Results")
61
+ st.write(f"**Highest similarity score:** {max_score:.4f}")
62
 
63
+ st.subheader("🧩 Most Similar Chunk")
64
+ st.write(chunks[max_index])
 
65
 
66
+ st.subheader("📈 All Chunk Similarities")
67
+ for i, (chunk, score) in enumerate(zip(chunks, similarities)):
68
+ st.write(f"Chunk {i+1} ({len(chunk)} chars): {score:.4f}")
69
+ st.expander(f"View chunk {i+1}").write(chunk)
70
 
71
  except Exception as e:
72
+ st.error(f"Error processing texts: {str(e)}")