GGroenendaal committed on
Commit
350ba75
1 Parent(s): 64df40b

add plots for reading times

Browse files
plots.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import scipy.stats as stats
5
+
6
+ data = pd.read_csv("results/timings.csv", index_col="Unnamed: 0")
7
+ data
8
+ # %%
9
+ data.columns
10
+ # %%
11
+
12
+ data_retrieve = data[["faiss_dpr.retrieve", "faiss_longformer.retrieve",
13
+ "es_dpr.retrieve", "es_longformer.retrieve"]]
14
+
15
+ # %%
16
+ plt.title("Retrieval time")
17
+ plt.ylabel("Time (s)")
18
+ plt.xlabel("Model")
19
+ plt.boxplot(data_retrieve, labels=[
20
+ "A1", "A2", "B1", "B2"])
21
+ plt.savefig("results/retrieval_time.png")
22
+
23
+ # %%
24
+ print(data_retrieve.describe())
25
+
26
+ with open("results/retrieval_time.tex", "w") as f:
27
+ f.write(data_retrieve.describe().to_latex())
28
+
29
+ # %%
30
+
31
+ # now the same for the reader
32
+ data_read = data[["faiss_dpr.read", "faiss_longformer.read",
33
+ "es_dpr.read", "es_longformer.read"]]
34
+
35
+ plt.title("Reading time")
36
+ plt.ylabel("Time (s)")
37
+ plt.xlabel("Model")
38
+ plt.boxplot(data_read, labels=["A1", "A2", "B1", "B2"])
39
+ plt.savefig("results/read_time.png")
40
+
41
+ # %%
42
+ print(data_read.describe())
43
+
44
+ with open("results/read_time.tex", "w") as f:
45
+ f.write(data_read.describe().to_latex())
46
+
47
+
48
+ # Statistical tests for retrieval and reading time
49
+
50
+ # %%
51
+ stats.probplot(data_retrieve["es_longformer.retrieve"], dist="norm", plot=plt)
52
+ # %%
53
+
54
+
55
+ # %%
56
+ anova_retrieve = stats.f_oneway(*data_retrieve.T.values)
57
+ anova_read = stats.f_oneway(*data_read.T.values)
58
+
59
+ print(f"retrieve\n {anova_retrieve} \n\nread\n {anova_read}")
60
+
61
+ # %%
results/read_time.png ADDED
results/read_time.tex ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ \begin{tabular}{lrrrr}
2
+ \toprule
3
+ {} & faiss\_dpr.read & faiss\_longformer.read & es\_dpr.read & es\_longformer.read \\
4
+ \midrule
5
+ count & 59.000000 & 59.000000 & 59.000000 & 59.000000 \\
6
+ mean & 1.222466 & 5.486930 & 1.866525 & 5.191112 \\
7
+ std & 0.923501 & 0.966157 & 1.005673 & 0.465743 \\
8
+ min & 0.341175 & 4.487846 & 0.314589 & 4.463429 \\
9
+ 25\% & 0.695762 & 4.767350 & 1.141979 & 4.858446 \\
10
+ 50\% & 0.919248 & 5.454382 & 1.650235 & 5.202449 \\
11
+ 75\% & 1.394425 & 5.699257 & 2.516944 & 5.362522 \\
12
+ max & 5.365102 & 10.146074 & 4.782422 & 6.431236 \\
13
+ \bottomrule
14
+ \end{tabular}
results/retrieval_time.png ADDED
results/retrieval_time.tex ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ \begin{tabular}{lrrrr}
2
+ \toprule
3
+ {} & faiss\_dpr.retrieve & faiss\_longformer.retrieve & es\_dpr.retrieve & es\_longformer.retrieve \\
4
+ \midrule
5
+ count & 59.000000 & 59.000000 & 59.000000 & 59.000000 \\
6
+ mean & 0.056994 & 0.854546 & 0.013451 & 0.013016 \\
7
+ std & 0.038737 & 0.165768 & 0.003771 & 0.002781 \\
8
+ min & 0.035896 & 0.729217 & 0.008990 & 0.009167 \\
9
+ 25\% & 0.043558 & 0.775807 & 0.010590 & 0.011279 \\
10
+ 50\% & 0.046970 & 0.795175 & 0.011699 & 0.012060 \\
11
+ 75\% & 0.056887 & 0.838984 & 0.016232 & 0.013151 \\
12
+ max & 0.303843 & 1.465686 & 0.026489 & 0.020290 \\
13
+ \bottomrule
14
+ \end{tabular}
test.py DELETED
@@ -1,20 +0,0 @@
1
- # %%
2
- from datasets import load_dataset
3
- from src.retrievers.faiss_retriever import FaissRetriever
4
-
5
-
6
- data = load_dataset("GroNLP/ik-nlp-22_slp", "paragraphs")
7
-
8
- # # %%
9
- # x = data["test"][:3]
10
-
11
- # # %%
12
- # for y in x:
13
-
14
- # print(y)
15
- # # %%
16
- # x.num_rows
17
-
18
- # # %%
19
- retriever = FaissRetriever(data)
20
- scores, result = retriever.retrieve("hello world")