mrchtr committed on
Commit
8bd9363
1 Parent(s): 181e8c5

Add dutch partisan news dataset

Browse files
.gitattributes CHANGED
@@ -33,3 +33,18 @@ adapted-retriever/sentence_bert_config.json filter=lfs diff=lfs merge=lfs -text
33
  adapted-retriever/special_tokens_map.json filter=lfs diff=lfs merge=lfs -text
34
  adapted-retriever/tokenizer.json filter=lfs diff=lfs merge=lfs -text
35
  adapted-retriever/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  adapted-retriever/special_tokens_map.json filter=lfs diff=lfs merge=lfs -text
34
  adapted-retriever/tokenizer.json filter=lfs diff=lfs merge=lfs -text
35
  adapted-retriever/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
36
+ documentstore_german-election-idx_adapted.pkl filter=lfs diff=lfs merge=lfs -text
37
+ dutch-article-idx_adapted.pkl filter=lfs diff=lfs merge=lfs -text
38
+ dutch-article-retriever filter=lfs diff=lfs merge=lfs -text
39
+ dutch-article-idx.pkl filter=lfs diff=lfs merge=lfs -text
40
+ dutch-article-retriever/1_Pooling filter=lfs diff=lfs merge=lfs -text
41
+ dutch-article-retriever/README.md filter=lfs diff=lfs merge=lfs -text
42
+ dutch-article-retriever/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
43
+ dutch-article-retriever/sentencepiece.bpe.model filter=lfs diff=lfs merge=lfs -text
44
+ dutch-article-retriever/config.json filter=lfs diff=lfs merge=lfs -text
45
+ dutch-article-retriever/config_sentence_transformers.json filter=lfs diff=lfs merge=lfs -text
46
+ dutch-article-retriever/modules.json filter=lfs diff=lfs merge=lfs -text
47
+ dutch-article-retriever/sentence_bert_config.json filter=lfs diff=lfs merge=lfs -text
48
+ dutch-article-retriever/special_tokens_map.json filter=lfs diff=lfs merge=lfs -text
49
+ dutch-article-retriever/tokenizer.json filter=lfs diff=lfs merge=lfs -text
50
+ dutch-article-retriever/tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -3,7 +3,8 @@
3
  Here's our first attempt at using data to create a table:
4
  """
5
  import streamlit as st
6
- from retriever import do_search
 
7
 
8
  def local_css(file_name):
9
  with open(file_name) as f:
@@ -16,7 +17,7 @@ def render_retrieved_content(content, score):
16
  if score is not None:
17
  score = round(score, 3)
18
  print_score = f'<b> Similarity Score: {score}</b>'
19
- return f'<blockquote>{content} </blockquote> {print_score}'
20
 
21
  local_css('style.css')
22
  st.header('🧐 Where my docs at?')
@@ -31,12 +32,12 @@ st.markdown('✨ Imagine you have a bunch of text documents and looking for one
31
  with st.form('search-input'):
32
  option = st.selectbox(
33
  'Choose a dataset',
34
- ('CDU election program 2021', 'Partisan news 2019 (dutch)'))
35
  search = st.text_input('Enter your search query')
36
  button = st.form_submit_button('Search')
37
 
38
  if search:
39
- result = do_search(search)
40
 
41
  st.markdown('### 🔎 Term Frequency–Inverse Document Frequency (TF-IDF)')
42
  st.markdown('Is a statistical approach that calculates how relevant a word is to a document '
3
  Here's our first attempt at using data to create a table:
4
  """
5
  import streamlit as st
6
+ from retriever import do_search, dutch_datset_name, german_datset_name
7
+
8
 
9
  def local_css(file_name):
10
  with open(file_name) as f:
17
  if score is not None:
18
  score = round(score, 3)
19
  print_score = f'<b> Similarity Score: {score}</b>'
20
+ return f'<blockquote> {content} </blockquote> {print_score}'
21
 
22
  local_css('style.css')
23
  st.header('🧐 Where my docs at?')
32
  with st.form('search-input'):
33
  option = st.selectbox(
34
  'Choose a dataset',
35
+ (german_datset_name, dutch_datset_name))
36
  search = st.text_input('Enter your search query')
37
  button = st.form_submit_button('Search')
38
 
39
  if search:
40
+ result = do_search(search, option)
41
 
42
  st.markdown('### 🔎 Term Frequency–Inverse Document Frequency (TF-IDF)')
43
  st.markdown('Is a statistical approach that calculates how relevant a word is to a document '
dutch-article-idx.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6b1edcffb6ca9c5409af117770d97415a119bcb02fc5c3ac338f82dadacdb51
3
+ size 24987947
dutch-article-idx_adapted.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed14bfd16fa49000673d7964bf90f3da854b3a17209554bc4ec6d1664f59858d
3
+ size 25239050
dutch-article-retriever/1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 768,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false
7
+ }
dutch-article-retriever/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:112c56ba0758ca51e45cda7f0d505af643c740abd0af7f740ec411d30708a96d
3
+ size 3696
dutch-article-retriever/config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29703b29b31e2dabfcd73e52ba0856489249af29f2c8fc5209415fccadfac0d3
3
+ size 821
dutch-article-retriever/config_sentence_transformers.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8c64b5cece00d8424b4896ea75b512b6008576088497609dfeb6bd63e6d36b8
3
+ size 122
dutch-article-retriever/modules.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f4b264b80206c830bebbdcae377e137925650a433b689343a63bdc9b3145460
3
+ size 229
dutch-article-retriever/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b906450207e003aaf2f08d775fedfb16b8438206899eb12a93f92059069ad8a
3
+ size 1112244081
dutch-article-retriever/sentence_bert_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec8e29d6dcb61b611b7d3fdd2982c4524e6ad985959fa7194eacfb655a8d0d51
3
+ size 53
dutch-article-retriever/sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
dutch-article-retriever/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:378eb3bf733eb16e65792d7e3fda5b8a4631387ca04d2015199c4d4f22ae554d
3
+ size 239
dutch-article-retriever/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46afe88da5fd71bdbab5cfab5e84c1adce59c246ea5f9341bbecef061891d0a7
3
+ size 17082913
dutch-article-retriever/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c84cba673d65cd6fabcaf0340ae8e57b34306e01862132f4b476936917727dea
3
+ size 483
retriever.py CHANGED
@@ -1,11 +1,12 @@
1
  from haystack.document_stores import InMemoryDocumentStore
2
- from haystack.utils import convert_files_to_docs
3
  from haystack.nodes.retriever import TfidfRetriever
4
  from haystack.pipelines import DocumentSearchPipeline, ExtractiveQAPipeline
5
  from haystack.nodes.retriever import EmbeddingRetriever
6
- from haystack.nodes import FARMReader
7
  import pickle
8
  from pprint import pprint
 
 
9
 
10
  class ExportableInMemoryDocumentStore(InMemoryDocumentStore):
11
  """
@@ -22,54 +23,71 @@ class ExportableInMemoryDocumentStore(InMemoryDocumentStore):
22
  self.indexes = pickle.load(f)
23
 
24
 
25
-
26
- document_store = ExportableInMemoryDocumentStore(similarity='cosine')
27
- document_store.load_data('documentstore_german-election-idx.pkl')
28
-
29
- document_store_adapted = ExportableInMemoryDocumentStore(similarity='cosine')
30
- document_store_adapted.load_data('documentstore_german-election-idx.pkl')
31
-
32
- retriever = TfidfRetriever(document_store=document_store)
33
- base_dense_retriever = EmbeddingRetriever(
34
- document_store=document_store,
35
- embedding_model='sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
36
- model_format='sentence_transformers'
37
- )
38
-
39
- fine_tuned_retriever = EmbeddingRetriever(
40
- document_store=document_store_adapted,
41
- embedding_model='./adapted-retriever',
42
- model_format='sentence_transformers'
43
- )
44
-
45
- def sparse_retrieval(query):
46
- """Sparse retrieval pipeline"""
47
- scores = retriever._calc_scores(query)
48
- p_retrieval = DocumentSearchPipeline(retriever)
49
- documents = p_retrieval.run(query=query)
50
- documents['documents'][0].score = list(scores[0].values())[0]
51
- return documents
52
-
53
- def dense_retrieval(query, retriever='base'):
54
- if retriever == 'base':
55
- p_retrieval = DocumentSearchPipeline(base_dense_retriever)
56
- elif retriever == 'adapted':
57
- p_retrieval = DocumentSearchPipeline(fine_tuned_retriever)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  else:
59
- return None
60
-
61
- return p_retrieval.run(query=query)
62
-
63
-
64
- def do_search(query):
65
- sparse_result = sparse_retrieval(query)['documents'][0]
66
- dense_base_result =dense_retrieval(query, retriever='base')['documents'][0]
67
- dense_adapted_result = dense_retrieval(query, retriever='adapted')['documents'][0]
68
- return sparse_result, dense_base_result, dense_adapted_result
69
 
70
  if __name__ == '__main__':
71
- query = 'Frauen'
72
- result = do_search(query)
 
 
 
73
  pprint(result)
74
 
75
 
1
  from haystack.document_stores import InMemoryDocumentStore
2
+
3
  from haystack.nodes.retriever import TfidfRetriever
4
  from haystack.pipelines import DocumentSearchPipeline, ExtractiveQAPipeline
5
  from haystack.nodes.retriever import EmbeddingRetriever
 
6
  import pickle
7
  from pprint import pprint
8
+ dutch_datset_name = 'Partisan news 2019 (dutch)'
9
+ german_datset_name = 'CDU election program 2021'
10
 
11
  class ExportableInMemoryDocumentStore(InMemoryDocumentStore):
12
  """
23
  self.indexes = pickle.load(f)
24
 
25
 
26
+ class SearchEngine():
27
+
28
+ def __init__(self, document_store_name_base, document_store_name_adpated,
29
+ adapted_retriever_path):
30
+ self.document_store = ExportableInMemoryDocumentStore(similarity='cosine')
31
+ self.document_store.load_data(document_store_name_base)
32
+
33
+ self.document_store_adapted = ExportableInMemoryDocumentStore(similarity='cosine')
34
+ self.document_store_adapted.load_data(document_store_name_adpated)
35
+
36
+ self.retriever = TfidfRetriever(document_store=self.document_store)
37
+
38
+ self.base_dense_retriever = EmbeddingRetriever(
39
+ document_store=self.document_store,
40
+ embedding_model='sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
41
+ model_format='sentence_transformers'
42
+ )
43
+
44
+ self.fine_tuned_retriever = EmbeddingRetriever(
45
+ document_store=self.document_store_adapted,
46
+ embedding_model=adapted_retriever_path,
47
+ model_format='sentence_transformers'
48
+ )
49
+
50
+ def sparse_retrieval(self, query):
51
+ """Sparse retrieval pipeline"""
52
+ scores = self.retriever._calc_scores(query)
53
+ p_retrieval = DocumentSearchPipeline(self.retriever)
54
+ documents = p_retrieval.run(query=query)
55
+ documents['documents'][0].score = list(scores[0].values())[0]
56
+ return documents
57
+
58
+ def dense_retrieval(self, query, retriever='base'):
59
+ if retriever == 'base':
60
+ p_retrieval = DocumentSearchPipeline(self.base_dense_retriever)
61
+ return p_retrieval.run(query=query)
62
+ if retriever == 'adapted':
63
+ p_retrieval = DocumentSearchPipeline(self.fine_tuned_retriever)
64
+ return p_retrieval.run(query=query)
65
+
66
+ def do_search(self, query):
67
+ sparse_result = self.sparse_retrieval(query)['documents'][0]
68
+ dense_base_result = self.dense_retrieval(query, 'base')['documents'][0]
69
+ dense_adapted_result = self.dense_retrieval(query, 'adapted')['documents'][0]
70
+ return sparse_result, dense_base_result, dense_adapted_result
71
+
72
+
73
+ dutch_search_engine = SearchEngine('dutch-article-idx.pkl', 'dutch-article-idx_adapted.pkl',
74
+ 'dutch-article-retriever')
75
+ german_search_engine = SearchEngine('documentstore_german-election-idx.pkl',
76
+ 'documentstore_german-election-idx_adapted.pkl',
77
+ 'adapted-retriever')
78
+
79
+ def do_search(query, dataset):
80
+ if dataset == german_datset_name:
81
+ return german_search_engine.do_search(query)
82
  else:
83
+ return dutch_search_engine.do_search(query)
 
 
 
 
 
 
 
 
 
84
 
85
  if __name__ == '__main__':
86
+ search_engine = SearchEngine('dutch-article-idx.pkl', 'dutch-article-idx_adapted.pkl',
87
+ 'dutch-article-retriever')
88
+ query = 'Kindergarten'
89
+
90
+ result = search_engine.do_search(query)
91
  pprint(result)
92
 
93