File size: 5,704 Bytes
096a82e
 
 
9889763
68be317
096a82e
 
d40a755
68be317
cf494b2
096a82e
 
 
cf494b2
 
096a82e
 
845df70
68be317
3ed7c41
 
 
 
 
096a82e
3ed7c41
871af30
cf494b2
cba50c7
adbdb15
3ed7c41
cba50c7
68b08cf
871af30
68b08cf
cba50c7
 
096a82e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8fb57f
096a82e
 
 
 
a8fb57f
 
 
 
 
 
 
 
 
 
b036a52
 
a8fb57f
 
 
 
 
 
096a82e
 
 
 
 
 
 
 
 
 
 
 
 
 
68be317
68b08cf
 
 
68be317
d40a755
68b08cf
 
cf494b2
 
871af30
 
cba50c7
9889763
cba50c7
adbdb15
 
 
9889763
adbdb15
 
cba50c7
68b08cf
096a82e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8fb57f
096a82e
a8fb57f
 
 
 
 
 
096a82e
 
68b08cf
2e3d9b7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import pyterrier as pt
pt.init()
import numpy as np
import pandas as pd
import gradio as gr
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
from pyterrier_dr import ElectraScorer
from pyterrier_gradio import Demo, MarkdownFile, interface, df2code, code2md, EX_D

# Identifier of the query-generation model; passed to Doc2Query below and shown
# in the read-only "Model" dropdowns.
MODEL = 'macavaney/doc2query-t5-base-msmarco'
# Identifier of the scoring model; shown in the read-only "Scorer" dropdown and
# echoed into the generated example code.
SCORE_MODEL = 'crystina-z/monoELECTRA_LCE_nneg31'

# Score thresholds in steps of five percentiles: entry i is the (i+1)*5-th
# percentile (5th .. 95th) of scores for generated queries on MS MARCO.
# 19 ascending values, so np.searchsorted() over it yields a bucket in 0..19.
PERCENTILES_BY_5 = np.array([
  -3.80468750e+00, -2.21679688e+00, -1.25683594e+00, -5.58105469e-01,
  -7.65323639e-04, 4.69482422e-01, 8.83300781e-01, 1.25878906e+00,
  1.61035156e+00, 1.94335938e+00, 2.26562500e+00, 2.58007812e+00,
  2.89648438e+00, 3.21484375e+00, 3.54687500e+00, 3.90039062e+00,
  4.30078125e+00, 4.77343750e+00, 5.37109375e+00,
])

# One background colour per score bucket (20 entries, index 0..19): red tones
# for the lowest buckets, white in the middle, green tones for the highest.
# Every channel must stay within 0..255.
COLORS = [
  'rgb(252, 132, 100)',
  'rgb(252, 148, 116)',
  'rgb(252, 166, 137)',
  'rgb(252, 183, 156)',
  'rgb(253, 200, 178)',
  'rgb(254, 215, 198)',
  'rgb(255, 228, 216)',
  'rgb(255, 237, 228)',
  'rgb(255, 245, 240)',
  'rgb(255, 255, 255)',
  'rgb(247, 252, 245)',
  'rgb(240, 250, 237)',
  'rgb(233, 247, 228)',
  'rgb(222, 242, 216)',
  'rgb(209, 237, 203)',
  'rgb(195, 232, 188)',
  'rgb(180, 225, 173)',
  'rgb(163, 218, 157)',
  'rgb(145, 210, 142)',
  'rgb(125, 201, 126)',
]

# Pipeline stages shared by both demos. They are created once at import time;
# predict() and predict_mm() overwrite `append`, `num_samples` and `t` on these
# instances for each request instead of building new ones.
doc2query = Doc2Query(
  MODEL,
  num_samples=5,
  append=True,
)
# The scorer is built with its default arguments and wrapped so it can score
# the generated queries.
electra = ElectraScorer()
query_scorer = QueryScorer(electra)
# Threshold `t` is a placeholder here; predict_mm() sets it before use.
query_filter = QueryFilter(append=False, t=0)

# Notebook file name handed to code2md() together with the install snippet.
COLAB_NAME = 'pyterrier_doc2query.ipynb'

# Install commands for the plain doc2query example, one shell line per entry.
COLAB_INSTALL = '\n'.join((
  '!pip install -q git+https://github.com/terrier-org/pyterrier',
  '!pip install -q git+https://github.com/terrierteam/pyterrier_doc2query',
))

# The scoring/filtering example additionally needs pyterrier_dr and faiss-cpu.
COLAB_INSTALL_MM = '\n'.join((
  COLAB_INSTALL,
  '!pip install -q git+https://github.com/terrierteam/pyterrier_dr faiss-cpu',
))

def predict(input, model, append, num_samples):
  """Run Doc2Query over the submitted documents for the first demo panel.

  Args:
    input: the documents to expand; handed unchanged to ``doc2query`` and
      ``df2code`` (presumably a DataFrame with ``docno`` and ``text`` columns,
      since ``generate_vis`` reads those from the result -- confirm).
    model: must equal ``MODEL``; the dropdown offers no other choice.
    append: stored on the shared ``doc2query.append`` attribute.
    num_samples: stored on the shared ``doc2query.num_samples`` attribute.

  Returns:
    A 3-tuple of (the transformer output, the markdown produced by
    ``code2md`` for the equivalent code snippet, the HTML from
    ``generate_vis``).
  """
  assert model == MODEL
  # The transformer instance is shared, so it is reconfigured per request.
  doc2query.append = append
  doc2query.num_samples = num_samples
  code = f'''import pandas as pd
from pyterrier_doc2query import Doc2Query

doc2query = Doc2Query({repr(model)}, append={append}, num_samples={num_samples})

doc2query({df2code(input)})
'''
  # Run generation exactly once and reuse the result for both the table and
  # the visualisation: a second call would repeat the model inference and, if
  # generation is sampled, could yield queries that differ from the ones shown.
  res = doc2query(input)
  vis = generate_vis(res)
  return (res, code2md(code, COLAB_INSTALL, COLAB_NAME), vis)

def generate_vis(df):
  """Render expanded documents as an HTML fragment.

  Each row becomes a bordered box holding the document text followed by its
  generated queries. All values that come from the data (docno, text, queries)
  are HTML-escaped so that characters such as ``<`` and ``&`` are displayed
  literally instead of being interpreted as markup.

  Args:
    df: DataFrame with ``docno`` and ``text`` columns. An optional
      ``querygen`` column holds newline-separated queries; an optional
      ``querygen_score`` column holds one score per query, in the same order.

  Returns:
    A single HTML string with one section per row, joined by newlines.
  """
  import html  # local import keeps this block self-contained

  result = []
  for row in df.itertuples(index=False):
    qs = []
    if hasattr(row, 'querygen_score'):
      for q, score in zip(row.querygen.split('\n'), row.querygen_score):
        # Number of percentile thresholds below the score: 0..19.
        bucket = np.searchsorted(PERCENTILES_BY_5, score)
        color = COLORS[bucket]
        percentile = bucket * 5
        qs.append(f'''
<div>
<span title="score={score:.4f}, in the {percentile}th percentile of scores" style="border: 1px solid #888; border-radius: 3px; font-size: 0.6em; font-family: monospace; background-color: {color}; padding: 1px 3px;">{percentile}th</span> {html.escape(q)}
</div>
''')
    elif hasattr(row, 'querygen'):
      for q in row.querygen.split('\n'):
        qs.append(f'''
<div>{html.escape(q)}</div>
''')
    qs = '\n'.join(qs)
    if qs:
      qs = f'''
<div><strong>Expansion Queries:</strong></div>
{qs}
'''
    # Escape first, then turn line breaks into <br/> so the tags survive.
    text = html.escape(row.text).replace('\n', '<br/>')
    docno = html.escape(str(row.docno))
    result.append(f'''
<div style="font-size: 1.2em;">Document: <strong>{docno}</strong></div>
<div style="margin: 4px 0 16px; padding: 4px; border: 1px solid black;">
<div>
{text}
</div>
{qs}
</div>
''')
  return '\n'.join(result)

def predict_mm(input, model, num_samples, score_model, filter_pct):
  """Generate queries, score them and optionally filter them (second demo).

  Args:
    input: the documents to expand; handed unchanged to the pipeline and to
      ``df2code``.
    model: must equal ``MODEL``.
    num_samples: stored on the shared ``doc2query.num_samples`` attribute.
    score_model: must equal ``SCORE_MODEL``.
    filter_pct: 0 skips the filter stage. Otherwise it selects the threshold
      ``PERCENTILES_BY_5[filter_pct//5-1]``, so it is expected in steps of 5.

  Returns:
    A 3-tuple of (the pipeline output with ``querygen_score`` rendered as a
    string, the markdown produced by ``code2md`` for the equivalent code
    snippet, the HTML from ``generate_vis``).
  """
  assert model == MODEL
  assert score_model == SCORE_MODEL

  def _scores_as_text(scores):
    # Bracketed, comma-separated rendering of one row's scores.
    return '[ ' + ', '.join(map(str, scores)) + ' ]'

  # Scores are attached as a separate column here, so queries are never
  # appended to the document text in this demo.
  doc2query.append = False
  doc2query.num_samples = num_samples

  # Only the import list and the pipeline expression differ between the
  # filtered and unfiltered variants of the generated snippet.
  if filter_pct > 0:
    query_filter.t = PERCENTILES_BY_5[filter_pct//5-1]
    pipeline = doc2query >> query_scorer >> query_filter
    extra_import = ', QueryFilter'
    pipeline_src = (
      f'doc2query >> QueryScorer(scorer) >> QueryFilter(append=False, t={query_filter.t})\n'
      f'# use append=True when indexing; t={query_filter.t} is the {filter_pct}th percentile for generated queries on MS MARCO'
    )
  else:
    pipeline = doc2query >> query_scorer
    extra_import = ''
    pipeline_src = 'doc2query >> QueryScorer(scorer)'

  code = (
    'import pyterrier as pt ; pt.init()\n'
    'import pandas as pd\n'
    f'from pyterrier_doc2query import Doc2Query, QueryScorer{extra_import}\n'
    'from pyterrier_dr import ElectraScorer\n'
    '\n'
    f'doc2query = Doc2Query({repr(model)}, append=False, num_samples={num_samples})\n'
    f'scorer = ElectraScorer({repr(score_model)})\n'
    f'pipeline = {pipeline_src}\n'
    '\n'
    f'pipeline({df2code(input)})\n'
  )

  res = pipeline(input)
  # Build the visualisation while the scores are still numeric; the column is
  # converted to display strings only afterwards.
  vis = generate_vis(res)
  res['querygen_score'] = res['querygen_score'].apply(_scores_as_text)
  return (res, code2md(code, COLAB_INSTALL_MM, COLAB_NAME), vis)

# Page sections, built top to bottom in display order: intro text, the plain
# doc2query demo, the scoring/filtering intro, its demo, and closing text.
readme_section = MarkdownFile('README.md')

# Widgets are listed in the order of predict()'s parameters after `input`:
# model, append, num_samples.
doc2query_demo = Demo(
  predict,
  EX_D,
  [
    gr.Dropdown(choices=[MODEL], value=MODEL, label='Model', interactive=False),
    gr.Checkbox(value=doc2query.append, label="Append"),
    gr.Slider(minimum=1, maximum=10, value=doc2query.num_samples, step=1., label='# Queries'),
  ],
)

mm_section = MarkdownFile('mm.md')

# Widgets are listed in the order of predict_mm()'s parameters after `input`:
# model, num_samples, score_model, filter_pct.
mm_demo = Demo(
  predict_mm,
  EX_D,
  [
    gr.Dropdown(choices=[MODEL], value=MODEL, label='Model', interactive=False),
    gr.Slider(minimum=1, maximum=10, value=doc2query.num_samples, step=1., label='# Queries'),
    gr.Dropdown(choices=[SCORE_MODEL], value=SCORE_MODEL, label='Scorer', interactive=False),
    # 0 disables filtering; the step of 5 matches the PERCENTILES_BY_5 spacing.
    gr.Slider(minimum=0, maximum=95, value=10, step=5, label='Filter (top % of queries)'),
  ],
)

wrapup_section = MarkdownFile('wrapup.md')

interface(
  readme_section,
  doc2query_demo,
  mm_section,
  mm_demo,
  wrapup_section,
).launch(share=False)