TomData committed on
Commit
e681b03
1 Parent(s): 0d7e513

update keyword search

Browse files
Files changed (2) hide show
  1. Home.py +1 -1
  2. src/chatbot.py +9 -6
Home.py CHANGED
@@ -24,7 +24,7 @@ with gr.Blocks() as App:
24
  #Row orientation
25
  with gr.Row() as additional_input:
26
  n_slider = gr.Slider(label="Number of Results", minimum=1, maximum=100, step=1, value=10)
27
- party_dopdown = gr.Dropdown(choices=['CDU/CSU','SPD','FDP','Grüne','not found','DIE LINKE.','PDS','KPD'], label='Party')
28
 
29
  search_btn = gr.Button('Search')
30
 
 
24
  #Row orientation
25
  with gr.Row() as additional_input:
26
  n_slider = gr.Slider(label="Number of Results", minimum=1, maximum=100, step=1, value=10)
27
+ party_dopdown = gr.Dropdown(value='All', choices=['All','CDU/CSU','SPD','FDP','Grüne','not found','DIE LINKE.','PDS','KPD'], label='Party') #change to all possible options
28
 
29
  search_btn = gr.Button('Search')
30
 
src/chatbot.py CHANGED
@@ -61,15 +61,18 @@ def chatbot(message, history, db=db, llm=llm, prompt=prompt2):
61
  return response
62
 
63
  # Retrieve speech contents based on keywords
64
- def keyword_search(query,n=10, db=db, embeddings=embeddings, method='ss', party_filter = ''):
65
  query_embedding = embeddings.embed_query(query)
66
  if method == 'mmr':
67
  df_res = pd.DataFrame(columns=['Speech Content','Date', 'Party', 'Relevance']) # Add Date/Party/Politician
68
- results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k = n, fetch_k = n + 10) #Add filter
69
  for doc in results:
 
 
 
 
70
  speech_content = doc[0].page_content
71
  speech_date = doc[0].metadata["date"]
72
- party = doc[0].metadata["party"]
73
  score = round(doc[1], ndigits=2) # Relevance based on relevance search
74
  df_res = pd.concat([df_res, pd.DataFrame({'Speech Content': [speech_content],
75
  'Date': [speech_date],
@@ -78,12 +81,12 @@ def keyword_search(query,n=10, db=db, embeddings=embeddings, method='ss', party_
78
  df_res.sort_values('Relevance', inplace=True, ascending=True)
79
  else:
80
  df_res = pd.DataFrame(columns=['Speech Content','Date', 'Party']) # Add Date/Party/Politician #Add filter
81
- results = db.similarity_search_by_vector(query_embedding, k = n, filter={"party": party_filter})
82
  for doc in results:
83
  party = doc.metadata["party"]
84
  #Filter by party input
85
- #if party != party_filter or party_filter == '':
86
- # continue
87
  speech_content = doc.page_content
88
  speech_date = doc.metadata["date"]
89
 
 
61
  return response
62
 
63
  # Retrieve speech contents based on keywords
64
+ def keyword_search(query,n=10, db=db, embeddings=embeddings, method='ss', party_filter = 'All'):
65
  query_embedding = embeddings.embed_query(query)
66
  if method == 'mmr':
67
  df_res = pd.DataFrame(columns=['Speech Content','Date', 'Party', 'Relevance']) # Add Date/Party/Politician
68
+ results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k = n)
69
  for doc in results:
70
+ party = doc[0].metadata["party"]
71
+ #Filter by party input
72
+ if party != party_filter and party_filter != 'All':
73
+ continue
74
  speech_content = doc[0].page_content
75
  speech_date = doc[0].metadata["date"]
 
76
  score = round(doc[1], ndigits=2) # Relevance based on relevance search
77
  df_res = pd.concat([df_res, pd.DataFrame({'Speech Content': [speech_content],
78
  'Date': [speech_date],
 
81
  df_res.sort_values('Relevance', inplace=True, ascending=True)
82
  else:
83
  df_res = pd.DataFrame(columns=['Speech Content','Date', 'Party']) # Add Date/Party/Politician #Add filter
84
+ results = db.similarity_search_by_vector(query_embedding, k = n)
85
  for doc in results:
86
  party = doc.metadata["party"]
87
  #Filter by party input
88
+ if party != party_filter and party_filter != 'All':
89
+ continue
90
  speech_content = doc.page_content
91
  speech_date = doc.metadata["date"]
92