Spaces:
Runtime error
Runtime error
import pyterrier as pt | |
pt.init() | |
import numpy as np | |
import pandas as pd | |
import gradio as gr | |
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter | |
from pyterrier_dr import ElectraScorer | |
from pyterrier_gradio import Demo, MarkdownFile, interface, df2code, code2md, EX_D | |
# Hugging Face identifiers of the only generation / scoring models the demo offers.
MODEL = 'macavaney/doc2query-t5-base-msmarco'
SCORE_MODEL = 'crystina-z/monoELECTRA_LCE_nneg31'

# Score thresholds at 5% steps: entry i is the (i + 1) * 5-th percentile of
# generated-query scores (5th .. 95th), giving 19 cut points and 20 buckets.
PERCENTILES_BY_5 = np.array([
    -3.80468750e+00, -2.21679688e+00, -1.25683594e+00, -5.58105469e-01,
    -7.65323639e-04, 4.69482422e-01, 8.83300781e-01, 1.25878906e+00,
    1.61035156e+00, 1.94335938e+00, 2.26562500e+00, 2.58007812e+00,
    2.89648438e+00, 3.21484375e+00, 3.54687500e+00, 3.90039062e+00,
    4.30078125e+00, 4.77343750e+00, 5.37109375e+00,
])

# One background colour per percentile bucket (red -> white -> green), so this
# list must hold exactly len(PERCENTILES_BY_5) + 1 entries.
# Channel values are kept inside the valid 0-255 range.
COLORS = [
    'rgb(252, 132, 100)',
    'rgb(252, 148, 116)',
    'rgb(252, 166, 137)',
    'rgb(252, 183, 156)',
    'rgb(253, 200, 178)',
    'rgb(254, 215, 198)',
    'rgb(255, 228, 216)',
    'rgb(255, 237, 228)',
    'rgb(255, 245, 240)',
    'rgb(255, 255, 255)',
    'rgb(247, 252, 245)',
    'rgb(240, 250, 237)',
    'rgb(233, 247, 228)',
    'rgb(222, 242, 216)',
    'rgb(209, 237, 203)',
    'rgb(195, 232, 188)',
    'rgb(180, 225, 173)',
    'rgb(163, 218, 157)',
    'rgb(145, 210, 142)',
    'rgb(125, 201, 126)',
]
# Shared, module-level transformer instances; the predict functions reconfigure
# `doc2query` and `query_filter` in place on every request.
doc2query = Doc2Query(MODEL, append=True, num_samples=5)
# Name the scoring model explicitly so the running scorer always matches the
# model advertised in the UI and in the generated example code.
electra = ElectraScorer(SCORE_MODEL)
query_scorer = QueryScorer(electra)
# `t` is only a placeholder here; predict_mm overwrites it before filtering.
query_filter = QueryFilter(t=0, append=False)
# Notebook file name handed to code2md together with the install commands.
COLAB_NAME = 'pyterrier_doc2query.ipynb'

# Install commands for the plain Doc2Query example.
COLAB_INSTALL = '\n'.join((
    '!pip install -q git+https://github.com/terrier-org/pyterrier',
    '!pip install -q git+https://github.com/terrierteam/pyterrier_doc2query',
))

# Install commands for the scoring/filtering example: the base commands plus
# the dense-retrieval package that provides the scorer.
COLAB_INSTALL_MM = '\n'.join((
    COLAB_INSTALL,
    '!pip install -q git+https://github.com/terrierteam/pyterrier_dr faiss-cpu',
))
def predict(input, model, append, num_samples):
    """Run Doc2Query over the input documents and build the demo outputs.

    Args:
        input: Document frame; passed unchanged to the shared ``doc2query``
            transformer and to ``df2code``.
        model: Model identifier selected in the UI; must equal ``MODEL``.
        append: Value assigned to ``doc2query.append``.
        num_samples: Number of queries to generate per document.

    Returns:
        A 3-tuple ``(result frame, markdown with equivalent code, HTML
        visualisation)``.

    Raises:
        ValueError: If ``model`` is not the single supported ``MODEL``.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if model != MODEL:
        raise ValueError(f'unsupported model {model!r}; expected {MODEL!r}')
    # The value comes from a slider; normalise in case it arrives as a float.
    num_samples = int(num_samples)

    # The transformer is a shared module-level instance, reconfigured per call.
    doc2query.append = append
    doc2query.num_samples = num_samples
    code = f'''import pandas as pd
from pyterrier_doc2query import Doc2Query
doc2query = Doc2Query({repr(model)}, append={append}, num_samples={num_samples})
doc2query({df2code(input)})
'''
    # Run the model exactly once and reuse the result for both the table and
    # the visualisation: a second pass doubles the cost and, if generation is
    # sampled, could show different queries in the two outputs.
    res = doc2query(input)
    vis = generate_vis(res)
    return (res, code2md(code, COLAB_INSTALL, COLAB_NAME), vis)
def generate_vis(df):
    """Render documents and their generated queries as an HTML fragment.

    Each row of ``df`` becomes one bordered box showing the document text
    followed, when present, by its expansion queries. If the row has a
    ``querygen_score`` field, every query is prefixed with a badge showing the
    percentile bucket of its score, coloured via ``COLORS``. If only
    ``querygen`` is present, the queries are listed without badges.

    Args:
        df: Frame with ``docno`` and ``text`` columns and optionally
            ``querygen`` (newline-separated queries) and ``querygen_score``
            (one score per query).

    Returns:
        The HTML for all rows joined by newlines; an empty string for an
        empty frame.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    boxes = []
    for row in df.itertuples(index=False):
        items = []
        if hasattr(row, 'querygen_score'):
            for q, score in zip(row.querygen.split('\n'), row.querygen_score):
                # Number of 5%-step thresholds below the score: 0..19.
                bucket = int(np.searchsorted(PERCENTILES_BY_5, score))
                color = COLORS[bucket]
                percentile = bucket * 5
                items.append(f'''
<div>
<span title="score={score:.4f}, in the {percentile}th percentile of scores" style="border: 1px solid #888; border-radius: 3px; font-size: 0.6em; font-family: monospace; background-color: {color}; padding: 1px 3px;">{percentile}th</span> {escape(q)}
</div>
''')
        elif hasattr(row, 'querygen'):
            for q in row.querygen.split('\n'):
                items.append(f'''
<div>{escape(q)}</div>
''')
        qs = '\n'.join(items)
        if qs:
            qs = f'''
<div><strong>Expansion Queries:</strong></div>
{qs}
'''
        # Document text, docno and queries are user/model supplied: escape them
        # before interpolation so they cannot inject markup. Line breaks are
        # converted only after escaping so the <br/> tags survive.
        text = escape(row.text).replace('\n', '<br/>')
        docno = escape(str(row.docno))
        boxes.append(f'''
<div style="font-size: 1.2em;">Document: <strong>{docno}</strong></div>
<div style="margin: 4px 0 16px; padding: 4px; border: 1px solid black;">
<div>
{text}
</div>
{qs}
</div>
''')
    return '\n'.join(boxes)
def predict_mm(input, model, num_samples, score_model, filter_pct):
    """Generate, score and optionally filter expansion queries for the demo.

    Args:
        input: Document frame; passed unchanged to the pipeline and to
            ``df2code``.
        model: Generation model selected in the UI; must equal ``MODEL``.
        num_samples: Number of queries to generate per document.
        score_model: Scoring model selected in the UI; must equal
            ``SCORE_MODEL``.
        filter_pct: Percentile (0-95) whose threshold from
            ``PERCENTILES_BY_5`` is used as the filter cut-off. Values are
            rounded down to a multiple of 5; 0 disables filtering.

    Returns:
        A 3-tuple ``(result frame, markdown with equivalent code, HTML
        visualisation)``. The ``querygen_score`` column of the returned frame
        is converted to a display string.

    Raises:
        ValueError: If ``model`` or ``score_model`` is unsupported, or
            ``filter_pct`` lies outside 0-95.
    """
    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if model != MODEL:
        raise ValueError(f'unsupported model {model!r}; expected {MODEL!r}')
    if score_model != SCORE_MODEL:
        raise ValueError(
            f'unsupported score model {score_model!r}; expected {SCORE_MODEL!r}')
    # Slider values may arrive as floats; numpy indexing below needs an int.
    num_samples = int(num_samples)
    filter_pct = int(filter_pct)
    if not 0 <= filter_pct <= 95:
        raise ValueError(f'filter_pct must be within 0..95, got {filter_pct}')
    # Round down to a tabulated 5% step so the index below never becomes -1
    # (which would silently select the last, i.e. 95th-percentile, threshold).
    filter_pct -= filter_pct % 5

    # The transformers are shared module-level instances, reconfigured per call.
    doc2query.append = False
    doc2query.num_samples = num_samples
    if filter_pct > 0:
        # Entry i of the table is the (i + 1) * 5-th percentile.
        query_filter.t = PERCENTILES_BY_5[filter_pct // 5 - 1]
        pipeline = doc2query >> query_scorer >> query_filter
        code = f'''import pyterrier as pt ; pt.init()
import pandas as pd
from pyterrier_doc2query import Doc2Query, QueryScorer, QueryFilter
from pyterrier_dr import ElectraScorer
doc2query = Doc2Query({repr(model)}, append=False, num_samples={num_samples})
scorer = ElectraScorer({repr(score_model)})
pipeline = doc2query >> QueryScorer(scorer) >> QueryFilter(append=False, t={query_filter.t})
# use append=True when indexing; t={query_filter.t} is the {filter_pct}th percentile for generated queries on MS MARCO
pipeline({df2code(input)})
'''
    else:
        pipeline = doc2query >> query_scorer
        code = f'''import pyterrier as pt ; pt.init()
import pandas as pd
from pyterrier_doc2query import Doc2Query, QueryScorer
from pyterrier_dr import ElectraScorer
doc2query = Doc2Query({repr(model)}, append=False, num_samples={num_samples})
scorer = ElectraScorer({repr(score_model)})
pipeline = doc2query >> QueryScorer(scorer)
pipeline({df2code(input)})
'''
    res = pipeline(input)
    # Build the visualisation first: it needs the numeric scores, which are
    # replaced by a display string on the next line.
    vis = generate_vis(res)
    res['querygen_score'] = res['querygen_score'].apply(
        lambda x: '[ ' + ', '.join(str(v) for v in x) + ' ]')
    return (res, code2md(code, COLAB_INSTALL_MM, COLAB_NAME), vis)
def _model_dropdown():
    """Return a locked dropdown listing the single supported generation model."""
    return gr.Dropdown(
        choices=[MODEL],
        value=MODEL,
        label='Model',
        interactive=False,
    )


def _num_queries_slider():
    """Return the 1-10 slider for the number of queries generated per document."""
    return gr.Slider(
        minimum=1,
        maximum=10,
        value=doc2query.num_samples,
        step=1.,
        label='# Queries',
    )


def _scorer_dropdown():
    """Return a locked dropdown listing the single supported scoring model."""
    return gr.Dropdown(
        choices=[SCORE_MODEL],
        value=SCORE_MODEL,
        label='Scorer',
        interactive=False,
    )


def _filter_slider():
    """Return the 0-95 slider (step 5, default 10) that feeds predict_mm's filter_pct."""
    return gr.Slider(
        minimum=0,
        maximum=95,
        value=10,
        step=5,
        label='Filter (top % of queries)',
    )


# Page layout: README text, the plain Doc2Query demo, the scoring/filtering
# demo and a wrap-up text. Widgets are created inline, in argument order, and
# each list matches the parameters of its predict function after `input`.
interface(
    MarkdownFile('README.md'),
    Demo(
        predict,
        EX_D,
        [_model_dropdown(), gr.Checkbox(value=doc2query.append, label='Append'), _num_queries_slider()],
    ),
    MarkdownFile('mm.md'),
    Demo(
        predict_mm,
        EX_D,
        [_model_dropdown(), _num_queries_slider(), _scorer_dropdown(), _filter_slider()],
    ),
    MarkdownFile('wrapup.md'),
).launch(share=False)