Joan Giner committed on
Commit
cfbb0ad
·
1 Parent(s): 07cb2d5

New version

Browse files
Files changed (2) hide show
  1. app.py +93 -468
  2. src/extractor.py +619 -0
app.py CHANGED
@@ -17,469 +17,68 @@ import asyncio
17
  from transformers import pipeline
18
  from dotenv import load_dotenv
19
  import json
 
20
  load_dotenv()
21
 
22
  ## Your API key from vendors or Hugging Face
23
  openai.api_key=os.getenv("OPEN_AI_API_KEY")
24
  LLMClient = OpenAI(model_name='text-davinci-003', openai_api_key=openai.api_key,temperature=0)
25
-
26
- # Extract text from a PDF file using SciPDF and the GROBID service (a reachable GROBID instance is required)
27
- def extract_text_from_pdf(file_path):
28
- article_dict = scipdf.parse_pdf_to_dict(file_path, soup=True,return_coordinates=False, grobid_url="https://kermitt2-grobid.hf.space") # return dictionary
29
- print("parsed")
30
- #source = article_dict.find("sourcedesc")
31
- #authors = source.find_all("persname")
32
- finaltext = article_dict['title'] + " \n\n " + article_dict['authors'] + " \n\n Abstract: " + article_dict['abstract'] + " \n\n "
33
- sections = []
34
- for section in article_dict['sections']:
35
- sec = section['heading'] + ": "
36
- if(isinstance(section['text'], str)):
37
- finaltext = finaltext + sec + section['text'] + " \n\n "
38
- else:
39
- for text in section['text']:
40
- sec = sec + text+ " \n\n "
41
- finaltext = finaltext + sec
42
- return finaltext
43
-
44
- # Extract and transform the tables of the papers
45
- async def get_tables(docsearch,chain_table,input_file):
46
- print("Getting tables")
47
- table_texts = []
48
- dfs = tabula.read_pdf(input_file.name, pages='all')
49
- for idx, table in enumerate(dfs):
50
- query = "Table "+str(idx+1)+":"
51
- docs = docsearch.similarity_search(query, k=4)
52
- #result = chain_table({"context":docs,"table":table})
53
- table_texts.append(async_table_generate(docs, table, chain_table))
54
- #print(query + " "+ result['text'])
55
- #table_texts.append(query + " "+ result['text'])
56
- table_texts = await asyncio.gather(*table_texts)
57
- for table in table_texts:
58
- docsearch.add_texts(table[1])
59
- return docsearch
60
-
61
- def extract_text_clean(file_path):
62
- file_extension = os.path.splitext(file_path.name)[1]
63
- if file_extension == ".pdf":
64
- all_text = extract_text_from_pdf(file_path.name)
65
- return all_text
66
- elif file_extension == ".txt":
67
- with open(file_path.name) as f:
68
- all_text = f.read()
69
- return all_text
70
-
71
- async def prepare_data(input_file, chain_table, apikey):
72
- #with open(input_file.name) as f:
73
- # documentation = f.read()
74
- file_name = input_file.name.split("/")[-1]
75
-
76
-
77
- # Process text and get the embeddings
78
- filepath = "./vectors/"+file_name
79
- if not apikey:
80
- apikey = openai.api_key
81
- gr.Error("Please set your api key")
82
- embeddings = OpenAIEmbeddings(openai_api_key=openai.api_key)
83
- if os.path.isfile(filepath+"/index.faiss"):
84
-
85
- # file exists
86
- docsearch = FAISS.load_local(filepath,embeddings=embeddings)
87
-
88
- print("We get the embeddings from local store")
89
- else:
90
- #progress(0.40, desc="Detected new document. Splitting and generating the embeddings")
91
- print("We generate the embeddings using thir-party service")
92
- # Get extracted running text
93
- text = extract_text_clean(input_file)
94
-
95
- # Configure the text splitter and embeddings
96
- text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=250, chunk_overlap=10, separators=[".", ",", " \n\n "])
97
-
98
- # Split, and clean
99
- texts = text_splitter.split_text(text)
100
- for idx, text in enumerate(texts):
101
- texts[idx] = text.replace('\n',' ')
102
- print("Creating embeddings")
103
- # Create an index search
104
- docsearch = FAISS.from_texts(texts, embeddings, metadatas=[{"source": i} for i in range(len(texts))])
105
-
106
- # Extract and prepare tables
107
- # progress(0.60, desc="Embeddings generated, parsing and transforming tables")
108
- if (os.path.splitext(input_file.name)[1] == '.pdf'):
109
- docsearch = await get_tables(docsearch,chain_table,input_file)
110
-
111
- # Save the index locally
112
- FAISS.save_local(docsearch, "./vectors/"+file_name)
113
-
114
- return docsearch
115
-
116
- def build_chains(apikey):
117
- if not apikey:
118
- apikey = openai.api_key
119
- gr.Error("Please set your api key")
120
- LLMClient = OpenAI(model_name='text-davinci-003',openai_api_key=apikey,temperature=0)
121
- ## In-context prompt
122
- prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
123
- Question: {question}
124
- ###
125
- Context:
126
- {context}
127
- ###
128
- Helpful answer:
129
- """
130
- in_context_prompt = PromptTemplate(
131
- input_variables=["context","question"],
132
- template=prompt_template,
133
- )
134
- chain_incontext = load_qa_chain(LLMClient, chain_type="stuff", prompt=in_context_prompt)
135
-
136
- # Table extraction prompts
137
- ## Table prompt to transform parsed tables in natural text
138
- prompt_template = """Given the following table in HTML, and the given context related the table: Translate the content of the table into natural language.
139
- ###
140
- Context:
141
- {context}
142
- ###
143
- Table: {table}
144
- ###
145
- Table translation:
146
- """
147
- table_prompt = PromptTemplate(
148
- input_variables=["context","table"],
149
- template=prompt_template,
150
- )
151
- chain_table = LLMChain(llm=LLMClient, prompt=table_prompt)
152
-
153
- return chain_incontext, chain_table
154
-
155
- async def async_table_generate(docs,table,chain):
156
-
157
- resp = await chain.arun({"context": docs, "table": table})
158
- #resp = "Description of the team, the type, and the demographics information, Description of the team, the type, and the demographics information"
159
- return resp
160
-
161
- async def async_generate(dimension, docs,question,chain):
162
- resp = await chain.arun({"input_documents": docs, "question": question})
163
- #resp = "Description of the team, the type, and the demographics information, Description of the team, the type, and the demographics information"
164
- return [dimension, resp]
165
-
166
- async def get_gathering_dimension(docsearch, incontext_prompt, retrieved_docs):
167
- dimensions = [
168
- {"Gathering description":"""Provide a summary of how the data of the dataset has been collected? Please avoid mention the annotation process or data preparation processes"""},
169
- {"Gathering type":"""Which of the following types corresponds to the gathering process mentioned in the context?
170
-
171
- Types: Web API, Web Scrapping, Sensors, Manual Human Curator, Software collection, Surveys, Observations, Interviews, Focus groups, Document analysis, Secondary data analysis, Physical data collection, Self-reporting, Experiments, Direct measurement, Interviews, Document analysis, Secondary data analysis, Physical data collection, Self-reporting, Experiments, Direct measurement, Customer feedback data, Audio or video recordings, Image data, Biometric data, Medical or health data, Financial data, Geographic or spatial data, Time series data, User-generated content data.
172
-
173
- Answer with "Others", if you are unsure. Please answer with only the type"""},
174
- {"Gathering team": """Who was the team who collect the data?"""},
175
- {"Team Type": """The data was collected by an internal team, an external team, or crowdsourcing team?""" },
176
- {"Team Demographics": "Are the any demographic information of team gathering the data?"},
177
- {"Timeframe ":""" Which are the timeframe when the data was collected?
178
- If present, answer only with the collection timeframe of the data. If your are not sure, or there is no mention, just answers 'not provided'"""},
179
- {"Sources": """Which is the source of the data during the collection process? Answer solely with the name of the source""" },
180
- {"Infrastructure": """Which tools or infrastructure has been used during the collection process?"""},
181
- {"Localization": """Which are the places where data has been collected?
182
- If present, answer only with the collection timeframe of the data. If your are not sure, or there is no mention, just answers 'not provided'"""}
183
-
184
- ]
185
-
186
- results = []
187
- for dimension in dimensions:
188
- for title, question in dimension.items():
189
- docs = docsearch.similarity_search(question, k=retrieved_docs)
190
- results.append(async_generate(title, docs,question,incontext_prompt))
191
-
192
- answers = await asyncio.gather(*results)
193
- return answers
194
-
195
- async def get_annotation_dimension(docsearch, incontext_prompt, retrieved_docs):
196
- dimensions = [
197
- {"Annotation description":"""How the data of the has been annotated or labelled? Provide a short summary of the annotation process"""},
198
- {"Annotation type":""" Which of the following category corresponds to the annotation
199
- process mentioned in the context?
200
-
201
- Categories: Bounding boxes, Lines and splines, Semantinc Segmentation, 3D cuboids, Polygonal segmentation, Landmark and key-point, Image and video annotations, Entity annotation, Content and textual categorization
202
-
203
- If you are not sure, answer with 'others'. Please answer only with the categories provided in the context. """},
204
- {"Labels":""" Which are the specific labels of the dataset? Can you enumerate it an provide a description of each one?"""},
205
- {"Team Description": """Who has annotated the data?"""},
206
- {"Team type": """The data was annotated by an internal team, an external team, or crowdsourcing team?""" },
207
- {"Team Demographics": """Is there any demographic information about the team who annotate the data?"""},
208
- {"Infrastructure": """Which tool has been used to annotate or label the dataset?"""},
209
- {"Validation": """How the quality of the labels have been validated?""" }
210
- ]
211
-
212
- results = []
213
- for dimension in dimensions:
214
- for title, question in dimension.items():
215
- docs = docsearch.similarity_search(question, k=retrieved_docs)
216
- results.append(async_generate(title, docs,question,incontext_prompt))
217
-
218
- answers = await asyncio.gather(*results)
219
- return answers
220
-
221
- async def get_social_concerns_dimension(docsearch, incontext_prompt, retrieved_docs):
222
- dimensions = [
223
- {"Representetiveness":"""Are there any social group that could be misrepresented in the dataset?"""},
224
- {"Biases":"""Is there any potential bias or imbalance in the data?"""},
225
- {"Sensitivity":""" Are there sensitive data, or data that can be offensive for people in the dataset?"""},
226
- {"Privacy":""" Is there any privacy issues on the data?"""},
227
-
228
- ]
229
-
230
- results = []
231
- for dimension in dimensions:
232
- for title, question in dimension.items():
233
- docs = docsearch.similarity_search(question, k=retrieved_docs)
234
- results.append(async_generate(title, docs,question,incontext_prompt))
235
-
236
- answers = await asyncio.gather(*results)
237
- return answers
238
-
239
- async def get_uses_dimension(docsearch, incontext_prompt, retrieved_docs):
240
- dimensions = [
241
- {"Purposes":"""Which are the purpose or purposes of the dataset?"""},
242
- {"Gaps":"""Which are the gaps the dataset intend to fill?"""},
243
- {"Task":"""Which machine learning tasks the dataset inteded for?:"""},
244
- {"Recommended":"""For which applications the dataset is recommended?"""},
245
- {"Non-Recommneded":"""Is there any non-recommneded application for the dataset? If you are not sure, or there is any non-recommended use of the dataset metioned in the context, just answer with "no"."""},
246
- ]
247
- results = []
248
- for dimension in dimensions:
249
- for title, question in dimension.items():
250
- docs = docsearch.similarity_search(question, k=retrieved_docs)
251
- if (title == "Task"):
252
- question = """Which of the following ML tasks for the dataset best matches the context?
253
-
254
- Tasks: text-classification, question-answering, text-generation, token-classification, translation,
255
- fill-mask, text-retrieval, conditional-text-generation, sequence-modeling, summarization, other,
256
- structure-prediction, information-retrieval, text2text-generation, zero-shot-retrieval,
257
- zero-shot-information-retrieval, automatic-speech-recognition, image-classification, speech-processing,
258
- text-scoring, audio-classification, conversational, question-generation, image-to-text, data-to-text,
259
- classification, object-detection, multiple-choice, text-mining, image-segmentation, dialog-response-generation,
260
- named-entity-recognition, sentiment-analysis, machine-translation, tabular-to-text, table-to-text, simplification,
261
- sentence-similarity, zero-shot-classification, visual-question-answering, text_classification, time-series-forecasting,
262
- computer-vision, feature-extraction, symbolic-regression, topic modeling, one liner summary, email subject, meeting title,
263
- text-to-structured, reasoning, paraphrasing, paraphrase, code-generation, tts, image-retrieval, image-captioning,
264
- language-modelling, video-captionning, neural-machine-translation, transkation, text-generation-other-common-sense-inference,
265
- text-generation-other-discourse-analysis, text-to-tabular, text-generation-other-code-modeling, other-text-search
266
-
267
- If you are not sure answer with just with "others".
268
- Please, answer only with one or some of the provided tasks """
269
-
270
- results.append(async_generate(title, docs,question,incontext_prompt))
271
-
272
- answers = await asyncio.gather(*results)
273
- return answers
274
-
275
- async def get_contributors_dimension(docsearch, incontext_prompt, retrieved_docs):
276
- dimensions = [
277
- {"Authors":"""Who are the authors of the dataset """},
278
- {"Funders":"""Is there any organization which supported or funded the creation of the dataset?"""},
279
- {"Maintainers":"""Who are the maintainers of the dataset?"""},
280
- {"Erratums":"""Is there any data retention limit in the dataset? If you are not sure, or there is no retention limit just answer with "no"."""},
281
- {"Data Retention Policies":"""Is there any data retention policies policiy of the dataset? If you are not sure, or there is no retention policy just answer with "no"."""},
282
- ]
283
-
284
- results = []
285
- for dimension in dimensions:
286
- for title, question in dimension.items():
287
- docs = docsearch.similarity_search(question, k=retrieved_docs)
288
- results.append(async_generate(title, docs,question,incontext_prompt))
289
-
290
- answers = await asyncio.gather(*results)
291
- return answers
292
-
293
- async def get_composition_dimension(docsearch, incontext_prompt, retrieved_docs):
294
- dimensions = [
295
- {"File composition":"""Can you provide a description of each files the dataset is composed of?"""},
296
- {"Attributes":"""Can you enumerate the different attributes present in the dataset? """},
297
- {"Trainig splits":"""The paper mentions any recommended data split of the dataset?"""},
298
- {"Relevant statistics":"""Are there relevant statistics or distributions of the dataset? """},
299
- ]
300
-
301
- results = []
302
- for dimension in dimensions:
303
- for title, question in dimension.items():
304
- docs = docsearch.similarity_search(question, k=retrieved_docs)
305
- results.append(async_generate(title, docs,question,incontext_prompt))
306
-
307
- answers = await asyncio.gather(*results)
308
- return answers
309
-
310
- async def get_distribution_dimension(docsearch, incontext_prompt, retrieved_docs):
311
- dimensions = [
312
- {"Data repository":"""Is there a link to the a repository containing the data? If you are not sure, or there is no link to the repository just answer with "no"."""},
313
- {"Licence":"""Which is the license of the dataset. If you are not sure, or there is mention to a license of the dataset in the context, just answer with "no". """},
314
- {"Deprecation policies":"""Is there any deprecation plan or policy of the dataset?
315
- """},
316
-
317
- ]
318
-
319
- results = []
320
- for dimension in dimensions:
321
- for title, question in dimension.items():
322
- docs = docsearch.similarity_search(question, k=retrieved_docs)
323
- results.append(async_generate(title, docs,question,incontext_prompt))
324
-
325
- answers = await asyncio.gather(*results)
326
- return answers
327
-
328
- def get_warnings(results):
329
- warnings = []
330
- classifier = pipeline("zero-shot-classification", model="valhalla/distilbart-mnli-12-1")
331
- for result in results:
332
- if(result[0] == "Team Demographics"):
333
- classifications = classifier(result[1], ["Have demographics information","Do not have demographics information"])
334
- if(classifications['labels'][0] == 'Do not have demographics information'):
335
- print("Dimension: "+result[0]+" is missing. Inserting a warning")
336
- warnings.append(result[0]+" is missing. This information is relevant to evaluate the quality of the labels")
337
- if(result[0] == "Localization"):
338
- classifications = classifier(result[1], ["Is a localization","Is not a localization"])
339
- if(classifications['labels'][0] == 'Is not a localization'):
340
- print("Dimension: "+result[0]+" is missing. Inserting a warning")
341
- warnings.append(result[0]+" is missing. Please indicate where the data has been collected")
342
- if(result[0] == "Time Localization"):
343
- classifications = classifier(result[1], ["It is a date","It is not a date"])
344
- if(classifications['labels'][0] == 'Is not a localization'):
345
- print("Dimension: "+result[0]+" is missing. Inserting a warning")
346
- warnings.append(result[0]+" is missing. Please indicate when the data has been collected")
347
- if len(warnings) == 0:
348
- warnings.append("No warnings")
349
- return warnings
350
 
351
  # Define function to handle the Gradio interface
352
- async def annotate_only(input_file, apikey):
353
- #progress(0, desc="Starting")
354
- # Build the chains
355
- chain_incontext, chain_table = build_chains(apikey)
356
- # Prepare the data
357
- #progress(0.20, desc="Preparing data: Generating embeddings, splitting text, and adding transformed tables")
358
- docsearch = await prepare_data(input_file, chain_table, apikey)
359
- # Get annotation dimensions
360
- #progress(0.40, desc="Extracting dimensions")
361
- results = await get_annotation_dimension(docsearch,chain_incontext, retrieved_docs=10)
362
- # Get warning
363
- #progress(0.80, desc="Generating Warning")
364
- warnings = get_warnings(results)
365
-
366
- # Build results in the correct format for the Gradio front-end
367
- results = pd.DataFrame(results, columns=['Dimension', 'Results'])
368
- return results, gr.update(value=pd.DataFrame(warnings,columns=['Warnings:']), visible=True)
369
- # Define function to handle the Gradio interface
370
- async def uses_only(input_file, apikey):
371
- # Build the chains
372
- chain_incontext, chain_table = build_chains(apikey)
373
- # Prepare the data
374
- docsearch = await prepare_data(input_file, chain_table, apikey)
375
- # Get uses dimensions
376
- results = await get_uses_dimension(docsearch,chain_incontext, retrieved_docs=10)
377
- # Get warning
378
- warnings = get_warnings(results)
379
-
380
- # Build results in the correct format for the Gradio front-end
381
- results = pd.DataFrame(results, columns=['Dimension', 'Results'])
382
- return results, gr.update(value=pd.DataFrame(warnings,columns=['Warnings:']), visible=True)
383
-
384
- # Define function to handle the Gradio interface
385
- async def distribution_only(input_file, apikey):
386
- # Build the chains
387
- chain_incontext, chain_table = build_chains(apikey)
388
- # Prepare the data
389
- docsearch = await prepare_data(input_file, chain_table, apikey)
390
- # Get distribution dimensions
391
- results = await get_distribution_dimension(docsearch,chain_incontext, retrieved_docs=10)
392
- # Get warning
393
- warnings = get_warnings(results)
394
-
395
- # Build results in the correct format for the Gradio front-end
396
- results = pd.DataFrame(results, columns=['Dimension', 'Results'])
397
- return results, gr.update(value=pd.DataFrame(warnings,columns=['Warnings:']), visible=True)
398
- # Define function to handle the Gradio interface
399
- async def social_cocerns_only(input_file, apikey):
400
- # Build the chains
401
- chain_incontext, chain_table = build_chains(apikey)
402
- # Prepare the data
403
- docsearch = await prepare_data(input_file, chain_table, apikey)
404
- # Get social concerns dimensions
405
- results = await get_social_concerns_dimension(docsearch,chain_incontext, retrieved_docs=10)
406
- # Get warning
407
- warnings = get_warnings(results)
408
-
409
- # Build results in the correct format for the Gradio front-end
410
- results = pd.DataFrame(results, columns=['Dimension', 'Results'])
411
- return results, gr.update(value=pd.DataFrame(warnings,columns=['Warnings:']), visible=True)
412
- # Define function to handle the Gradio interface
413
- async def composition_only(input_file, apikey):
414
- # Build the chains
415
- chain_incontext, chain_table = build_chains(apikey)
416
- # Prepare the data
417
- docsearch = await prepare_data(input_file, chain_table, apikey)
418
- # Get composition dimensions
419
- results = await get_composition_dimension(docsearch,chain_incontext, retrieved_docs=10)
420
- # Get warning
421
- warnings = get_warnings(results)
422
-
423
- # Build results in the correct format for the Gradio front-end
424
- results = pd.DataFrame(results, columns=['Dimension', 'Results'])
425
- return results, gr.update(value=pd.DataFrame(warnings,columns=['Warnings:']), visible=True)
426
- # Define function to handle the Gradio interface
427
- async def contributors_only(input_file, apikey):
428
  # Build the chains
429
- chain_incontext, chain_table = build_chains(apikey)
430
  # Prepare the data
431
- docsearch = await prepare_data(input_file, chain_table, apikey)
432
- # Get contributors dimensions
433
- results = await get_contributors_dimension(docsearch,chain_incontext, retrieved_docs=10)
434
- # Get warning
435
- warnings = get_warnings(results)
436
-
437
- # Build results in the correct format for the Gradio front-end
438
- results = pd.DataFrame(results, columns=['Dimension', 'Results'])
439
- return results, gr.update(value=pd.DataFrame(warnings,columns=['Warnings:']), visible=True)
440
-
441
- async def gather_only(input_file, apikey):
442
- # Build the chains
443
- chain_incontext, chain_table = build_chains(apikey)
444
- # Prepare the data
445
- docsearch = await prepare_data(input_file, chain_table, apikey)
446
- # Get gathering dimensions
447
- results = await get_gathering_dimension(docsearch,chain_incontext, retrieved_docs=10)
448
- # Get warning
449
- warnings = get_warnings(results)
450
- results = pd.DataFrame(results, columns=['Dimension', 'Results'])
451
- return results, gr.update(value=pd.DataFrame(warnings, columns=['Warnings:']), visible=True)
 
 
 
 
 
452
 
453
  async def complete(input_file):
454
-
455
  # Build the chains
456
- chain_incontext, chain_table = build_chains(apikey=os.getenv("OPEN_AI_API_KEY"))
457
  # Prepare the data
458
- docsearch = await prepare_data(input_file, chain_table, apikey=os.getenv("OPEN_AI_API_KEY"))
459
  #Retrieve dimensions
460
- results = await asyncio.gather(get_annotation_dimension(docsearch,chain_incontext, retrieved_docs=10),
461
- get_gathering_dimension(docsearch,chain_incontext, retrieved_docs=10),
462
- get_uses_dimension(docsearch,chain_incontext, retrieved_docs=10),
463
- get_contributors_dimension(docsearch,chain_incontext, retrieved_docs=10),
464
- get_composition_dimension(docsearch,chain_incontext, retrieved_docs=10),
465
- get_social_concerns_dimension(docsearch,chain_incontext, retrieved_docs=10),
466
- get_distribution_dimension(docsearch,chain_incontext, retrieved_docs=10))
467
- # Get warning from the results
468
  warnings = []
 
469
  for result in results:
470
- warnings.append(gr.update(value=[get_warnings(result)], visible=True))
471
- #warnings_dt = gr.update(value=pd.DataFrame(warnings,columns=['Warnings:'],labels= None), visible=True)
472
- results.extend(warnings)
473
- return results
474
-
475
- async def annotation_api_wrapper(input_file, apikey):
476
-
477
- results, alarms = await annotate_only(input_file, apikey)
478
-
479
- response_results = results.to_dict()
480
- response_alarms = alarms['value'].to_dict()
481
- api_answer = {'results':response_results, 'warnings': response_alarms}
482
- return api_answer
483
 
484
  ## Building the layout of the app
485
  css = """.table-wrap.scroll-hide.svelte-8hrj8a.no-wrap {
@@ -530,28 +129,32 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
530
  with gr.Row():
531
  gr.Markdown("## DataDoc Analyzer")
532
  with gr.Row():
533
- gr.Markdown("""Extract, in a structured manner, the **[general guidelines](https://knowingmachines.org/reading-list#dataset_documentation_practices)** from the ML community about dataset documentation practices from its scientific documentation. Study and analyze scientific data published in peer-review journals such as: **[Nature's Scientific Data](https://duckduckgo.com)** and **[Data-in-Brief](https://duckduckgo.com)**. Here you have a **[complete list](https://zenodo.org/record/7082126#.ZDaf-OxBz0p)** of data journals suitable to be analyzed with this tool.
534
  """)
535
 
536
  with gr.Row():
537
 
538
  with gr.Column():
539
- fileinput = gr.File(label="Upload TXT file"),
540
 
541
  with gr.Column():
542
  gr.Markdown(""" <h4 style=text-align:center>Instructions: </h4>
543
 
544
- <b> &#10549; Try the examples </b> at the bottom
 
 
545
 
546
- <b> &#8680; Set your API KEY </b> of OpenAI
 
547
 
548
  <b> &#8678; Upload </b> your data paper (in PDF or TXT)
549
 
550
  <b> &#8681; Click in get insights </b> in one tab!
551
 
 
552
  """)
553
  with gr.Column():
554
- apikey_elem = gr.Text(label="Your OpenAI APIKey")
555
  # gr.Markdown("""
556
  # <h3> Improving your data and assessing your dataset documentation </h3>
557
  # The generated warnings also allow you to quickly check the completeness of the documentation and spot gaps in the document
@@ -560,40 +163,40 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
560
  with gr.Row():
561
  with gr.Tab("Annotation"):
562
 
563
- gr.Markdown("""In this chapter, we get information regarding the annotation process of the data: We provide a description of the process and we infer its type from the documentation. Then we extract the labels, and information about the annotation team, the infrastructure used to annotate the data and the validation process applied over the labels""")
564
  result_anot = gr.DataFrame(headers=["dimension","result"],type="array",label="Results of the extraction:")
565
  alerts_anot = gr.DataFrame(headers=["warnings"],type="array", visible=False)
566
  button_annotation = gr.Button("Get the annotation process insights!")
567
 
568
  with gr.Tab("Gathering"):
569
- gr.Markdown("""In this chapter, we get information regarding the collection process of the data: We provide a description of the process and we infer its type from the documentation. Then we extract information about the collection team, the infrastructure used to collect the data and the sources. Also we get the timeframe of the data collection and its geolocalization.""")
570
  result_gather = gr.DataFrame(headers=["dimension","result"],type="array",label="Results of the extraction:")
571
  alerts_gather = gr.DataFrame(headers=["warnings"],type="array", visible=False)
572
  button_gathering = gr.Button("Get the gathering process insights!")
573
  with gr.Tab("Uses"):
574
- gr.Markdown("""In this chapter, we extract the design intentios of the authors, we extract the purposes, gaps, and we infer the ML tasks (extracted form hugginface) the dataset is inteded for. Also we get the uses recomendation and the ML Benchmarks if the dataset have been tested with them""")
575
  result_uses = gr.DataFrame(headers=["dimension","result"],type="array",label="Results of the extraction:")
576
  alerts_uses = gr.DataFrame(headers=["warnings"],type="array", visible=False)
577
  button_uses = gr.Button("Get the uses of the dataset!")
578
  with gr.Tab("Contributors"):
579
- gr.Markdown("""In this chapter, we extract all the contributors, funding information and maintenance of the dataset""")
580
  result_contrib = gr.DataFrame(headers=["dimension","result"],type="array",label="Results of the extraction:")
581
  alerts_contrib = gr.DataFrame(headers=["warnings"],type="array", visible=False)
582
  button_contrib = gr.Button("Get the contributors of the dataset!")
583
 
584
  with gr.Tab("Composition"):
585
- gr.Markdown("""In this chapter, we extract the file structure, we identify the attributes of the dataset, the recommneded trainig splits and the relevant statistics (if provided in the documentation) """)
586
  result_comp = gr.DataFrame(headers=["dimension","result"],type="array",label="Results of the extraction:")
587
  alerts_comp = gr.DataFrame(headers=["warnings"],type="array", visible=False)
588
  button_comp = gr.Button("Get the composition of the dataset!")
589
  with gr.Tab("Social Concerns"):
590
- gr.Markdown("""In this chapter, we extract social concerns regarding the representativeness of social groups, potential biases, sensitivity issues, and privacy issues. """)
591
  result_social = gr.DataFrame(headers=["dimension","result"],type="array",label="Results of the extraction:")
592
  alerts_social = gr.DataFrame(headers=["warnings"],type="array", visible=False)
593
  button_social = gr.Button("Get the Social Cocerns!")
594
 
595
  with gr.Tab("Distribution"):
596
- gr.Markdown("""In this chapter, we aim to extract the legal conditions under the dataset is released) """)
597
  result_distri = gr.DataFrame(headers=["dimension","result"],type="array",label="Results of the extraction:")
598
  alerts_distribution = gr.DataFrame(headers=["warning"],type="array", visible=False)
599
  button_dist = gr.Button("Get the Distribution!")
@@ -621,20 +224,42 @@ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
621
  button_complete = gr.Button("Get all the dimensions", visible=False)
622
  allres = gr.Text(visible=False)
623
  ## Events of the apps
624
- button_annotation.click(annotate_only,inputs=[fileinput[0],apikey_elem ],outputs=[result_anot,alerts_anot])
625
- button_gathering.click(gather_only,inputs=[fileinput[0],apikey_elem ],outputs=[result_gather,alerts_gather])
626
- button_uses.click(uses_only,inputs=[fileinput[0],apikey_elem ],outputs=[result_uses,alerts_uses])
627
- button_contrib.click(contributors_only,inputs=[fileinput[0],apikey_elem ],outputs=[result_contrib,alerts_contrib])
628
- button_comp.click(composition_only,inputs=[fileinput[0],apikey_elem ],outputs=[result_comp,alerts_comp])
629
- button_social.click(social_cocerns_only,inputs=[fileinput[0],apikey_elem ],outputs=[result_social,alerts_social])
630
- button_dist.click(distribution_only,inputs=[fileinput[0],apikey_elem ],outputs=[result_distri,alerts_distribution])
631
 
632
 
633
  ## API endpoints
634
- button_complete.click(annotation_api_wrapper,inputs=[fileinput[0],apikey_elem],outputs=allres, api_name="our")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635
 
636
 
637
  # Run the app
638
  #demo.queue(concurrency_count=5,max_size=20).launch()
639
- demo.launch(auth=("CKIM2023", "demodemo"))
640
 
 
17
  from transformers import pipeline
18
  from dotenv import load_dotenv
19
  import json
20
+ from src.extractor import Extractor
21
  load_dotenv()
22
 
23
  ## You api key from vendors or hugginface
24
  openai.api_key=os.getenv("OPEN_AI_API_KEY")
25
  LLMClient = OpenAI(model_name='text-davinci-003', openai_api_key=openai.api_key,temperature=0)
26
+ extractor = Extractor()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  # Define function to handle the Gradio interface
29
async def extraction(input_file, apikey, dimension):
    """Extract a single documentation dimension from an uploaded file.

    Args:
        input_file: Uploaded file object; only its ``name`` attribute (the
            path of the file on disk) is read.
        apikey: OpenAI API key forwarded to the extractor.
        dimension: One of "annotation", "gathering", "uses", "contrib",
            "comp", "social" or "dist".

    Returns:
        The ``(results, completeness_report)`` tuple produced by the matching
        ``get_*_dimension`` method of the module-level ``extractor``.

    Raises:
        ValueError: If ``dimension`` is not one of the supported keys.
    """
    getter_names = {
        "annotation": "get_annotation_dimension",
        "gathering": "get_gathering_dimension",
        "uses": "get_uses_dimension",
        "contrib": "get_contributors_dimension",
        "comp": "get_composition_dimension",
        "social": "get_social_concerns_dimension",
        "dist": "get_distribution_dimension",
    }
    # Validate before building chains/embeddings so a bad key fails fast
    # instead of surfacing later as an unbound ``results`` variable.
    if dimension not in getter_names:
        raise ValueError("Unknown dimension: " + repr(dimension))

    file_path = input_file.name
    file_name = file_path.split("/")[-1]
    # Build the chains
    chain_incontext, chain_table = extractor.build_chains(apikey)
    # Prepare the data (prepare_data expects both the file name and its path)
    docsearch = await extractor.prepare_data(file_name, file_path, chain_table, apikey)
    # Extract the requested dimension
    getter = getattr(extractor, getter_names[dimension])
    results, completeness_report = await getter(docsearch, chain_incontext, retrieved_docs=10)
    return results, completeness_report
52
+
53
async def ui_extraction(input_file, apikey, dimension):
    """Run one extraction and shape its output for the Gradio front-end.

    Args:
        input_file: Uploaded file object; its ``name`` attribute is used as
            the path, and the last "/"-separated component as the file name.
        apikey: OpenAI API key forwarded to the extractor.
        dimension: Dimension key forwarded to ``extractor.extraction``.

    Returns:
        A tuple of the results DataFrame and a ``gr.update`` that makes the
        completeness report visible.
    """
    uploaded_path = input_file.name
    short_name = uploaded_path.split("/")[-1]
    results, completeness_report = await extractor.extraction(short_name, uploaded_path, apikey, dimension)

    # Build results in the correct format for the Gradio front-end
    results_table = pd.DataFrame(results, columns=['Dimension', 'Results'])
    report_table = pd.DataFrame(
        completeness_report['report'],
        columns=['Completeness report: ' + str(completeness_report['completeness']) + '%'],
    )
    return results_table, gr.update(value=report_table, visible=True)
59
 
60
async def complete(input_file):
    """Extract every dimension of the uploaded file concurrently.

    The OpenAI key is read from the ``OPEN_AI_API_KEY`` environment variable.

    Args:
        input_file: Uploaded file object; its ``name`` attribute is the path.

    Returns:
        A flat list: first the extracted results of each dimension, then one
        ``gr.update`` per dimension carrying its completeness report.
    """
    file_path = input_file.name
    file_name = file_path.split("/")[-1]
    # Build the chains
    chain_incontext, chain_table = extractor.build_chains(apikey=os.getenv("OPEN_AI_API_KEY"))
    # Prepare the data
    docsearch = await extractor.prepare_data(file_name, file_path, chain_table, apikey=os.getenv("OPEN_AI_API_KEY"))
    # Retrieve all the dimensions at once
    getters = (
        extractor.get_annotation_dimension,
        extractor.get_gathering_dimension,
        extractor.get_uses_dimension,
        extractor.get_contributors_dimension,
        extractor.get_composition_dimension,
        extractor.get_social_concerns_dimension,
        extractor.get_distribution_dimension,
    )
    results = await asyncio.gather(
        *[getter(docsearch, chain_incontext, retrieved_docs=10) for getter in getters]
    )
    # Split each (answers, completeness report) pair into its two outputs
    extracts = [result[0] for result in results]
    report_updates = [
        gr.update(
            value=pd.DataFrame(
                result[1]['report'],
                columns=['Completeness report: ' + str(result[1]['completeness']) + '%'],
            ),
            visible=True,
        )
        for result in results
    ]
    return extracts + report_updates
 
 
 
 
 
 
 
 
 
82
 
83
  ## Building the layout of the app
84
  css = """.table-wrap.scroll-hide.svelte-8hrj8a.no-wrap {
 
129
  with gr.Row():
130
  gr.Markdown("## DataDoc Analyzer")
131
  with gr.Row():
132
+ gr.Markdown("""Extract, in a structured manner, the **[general guidelines](https://knowingmachines.org/reading-list#dataset_documentation_practices)** from the ML community about dataset documentation practices from its scientific documentation. Study and analyze scientific data published in peer-review journals such as: **[Nature's Scientific Data](https://www.nature.com/sdata/)** and **[Data-in-Brief](https://www.data-in-brief.com)**. Here you have a **[complete list](https://zenodo.org/record/7082126#.ZDaf-OxBz0p)** of data journals suitable to be analyzed with this tool.
133
  """)
134
 
135
  with gr.Row():
136
 
137
  with gr.Column():
138
+ fileinput = gr.File(label="Upload the dataset documentation"),
139
 
140
  with gr.Column():
141
  gr.Markdown(""" <h4 style=text-align:center>Instructions: </h4>
142
 
143
+ <b> &#10549; Try the examples </b> at the bottom
144
+
145
+ <b> then </b>
146
 
147
+
148
+ <b> &#8680; Set your API key </b> of OpenAI
149
 
150
  <b> &#8678; Upload </b> your data paper (in PDF or TXT)
151
 
152
  <b> &#8681; Click in get insights </b> in one tab!
153
 
154
+
155
  """)
156
  with gr.Column():
157
+ apikey_elem = gr.Text(label="OpenAI API key (Not needed during review)")
158
  # gr.Markdown("""
159
  # <h3> Improving your data and assesing your dataset documentation </h3>
160
  # The generated warning also allows you quicly check the completeness of the documentation, and spotting gaps in the document
 
163
  with gr.Row():
164
  with gr.Tab("Annotation"):
165
 
166
+ gr.Markdown("""In this dimension, you can get information regarding the annotation process of the data: Extract a description of the process and infer its type. Extract the labels and information about the annotation team, the infrastructure used to annotate the data, and the validation process applied to the labels.""")
167
  result_anot = gr.DataFrame(headers=["dimension","result"],type="array",label="Results of the extraction:")
168
  alerts_anot = gr.DataFrame(headers=["warnings"],type="array", visible=False)
169
  button_annotation = gr.Button("Get the annotation process insights!")
170
 
171
  with gr.Tab("Gathering"):
172
+ gr.Markdown("""In this dimension, we get information regarding the collection process of the data: We provide a description of the process and we infer its type from the documentation. Then we extract information about the collection team, the infrastructure used to collect the data and the sources. Also we get the timeframe of the data collection and its geolocalization.""")
173
  result_gather = gr.DataFrame(headers=["dimension","result"],type="array",label="Results of the extraction:")
174
  alerts_gather = gr.DataFrame(headers=["warnings"],type="array", visible=False)
175
  button_gathering = gr.Button("Get the gathering process insights!")
176
  with gr.Tab("Uses"):
177
+ gr.Markdown("""In this dimension, we extract the design intentios of the authors, we extract the purposes, gaps, and we infer the ML tasks (extracted form hugginface) the dataset is inteded for. Also we get the uses recomendation and the ML Benchmarks if the dataset have been tested with them""")
178
  result_uses = gr.DataFrame(headers=["dimension","result"],type="array",label="Results of the extraction:")
179
  alerts_uses = gr.DataFrame(headers=["warnings"],type="array", visible=False)
180
  button_uses = gr.Button("Get the uses of the dataset!")
181
  with gr.Tab("Contributors"):
182
+ gr.Markdown("""In this dimension, we extract all the contributors, funding information and maintenance of the dataset""")
183
  result_contrib = gr.DataFrame(headers=["dimension","result"],type="array",label="Results of the extraction:")
184
  alerts_contrib = gr.DataFrame(headers=["warnings"],type="array", visible=False)
185
  button_contrib = gr.Button("Get the contributors of the dataset!")
186
 
187
  with gr.Tab("Composition"):
188
+ gr.Markdown("""In this dimension, we extract the file structure, we identify the attributes of the dataset, the recommneded trainig splits and the relevant statistics (if provided in the documentation) """)
189
  result_comp = gr.DataFrame(headers=["dimension","result"],type="array",label="Results of the extraction:")
190
  alerts_comp = gr.DataFrame(headers=["warnings"],type="array", visible=False)
191
  button_comp = gr.Button("Get the composition of the dataset!")
192
  with gr.Tab("Social Concerns"):
193
+ gr.Markdown("""In this dimension, we extract social concerns regarding the representativeness of social groups, potential biases, sensitivity issues, and privacy issues. """)
194
  result_social = gr.DataFrame(headers=["dimension","result"],type="array",label="Results of the extraction:")
195
  alerts_social = gr.DataFrame(headers=["warnings"],type="array", visible=False)
196
  button_social = gr.Button("Get the Social Cocerns!")
197
 
198
  with gr.Tab("Distribution"):
199
+ gr.Markdown("""In this dimension, we aim to extract the legal conditions under the dataset is released) """)
200
  result_distri = gr.DataFrame(headers=["dimension","result"],type="array",label="Results of the extraction:")
201
  alerts_distribution = gr.DataFrame(headers=["warning"],type="array", visible=False)
202
  button_dist = gr.Button("Get the Distribution!")
 
224
  button_complete = gr.Button("Get all the dimensions", visible=False)
225
  allres = gr.Text(visible=False)
226
  ## Events of the apps
227
+ button_annotation.click(ui_extraction,inputs=[fileinput[0],apikey_elem,gr.State(value="annotation")],outputs=[result_anot,alerts_anot])
228
+ button_gathering.click(ui_extraction,inputs=[fileinput[0],apikey_elem,gr.State("gathering") ],outputs=[result_gather,alerts_gather])
229
+ button_uses.click(ui_extraction,inputs=[fileinput[0],apikey_elem,gr.State("uses") ],outputs=[result_uses,alerts_uses])
230
+ button_contrib.click(ui_extraction,inputs=[fileinput[0],apikey_elem,gr.State("contrib") ],outputs=[result_contrib,alerts_contrib])
231
+ button_comp.click(ui_extraction,inputs=[fileinput[0],apikey_elem,gr.State("comp") ],outputs=[result_comp,alerts_comp])
232
+ button_social.click(ui_extraction,inputs=[fileinput[0],apikey_elem,gr.State("social") ],outputs=[result_social,alerts_social])
233
+ button_dist.click(ui_extraction,inputs=[fileinput[0],apikey_elem,gr.State("dist") ],outputs=[result_distri,alerts_distribution])
234
 
235
 
236
  ## API endpoints
237
+ #api_annotation = gr.Button(visible=False)
238
+ #api_annotation.click(api_extraction,inputs=[fileinput[0],apikey_elem,gr.State(value="annotation")],outputs=[result_anot,alerts_anot], api_name="annotation")
239
+ #api_gathering = gr.Button(visible=False)
240
+ #api_gathering.click(api_extraction,inputs=[fileinput[0],apikey_elem,gr.State(value="gathering")],outputs=[result_anot,alerts_anot], api_name="gathering")
241
+ #api_uses = gr.Button(visible=False)
242
+ #api_uses.click(api_extraction,inputs=[fileinput[0],apikey_elem,gr.State(value="uses")],outputs=[result_anot,alerts_anot], api_name="uses")
243
+ # api_contrib = gr.Button(visible=False)
244
+ # api_contrib.click(api_extraction,inputs=[fileinput[0],apikey_elem,gr.State(value="contrib")],outputs=[result_anot,alerts_anot], api_name="contrib")
245
+ #api_comp = gr.Button(visible=False)
246
+ #api_comp.click(api_extraction,inputs=[fileinput[0],apikey_elem,gr.State(value="comp")],outputs=[result_anot,alerts_anot], api_name="composition")
247
+ #api_social = gr.Button(visible=False)
248
+ #api_social.click(api_extraction,inputs=[fileinput[0],apikey_elem,gr.State(value="social")],outputs=[result_anot,alerts_anot], api_name="social")
249
+ #api_dist = gr.Button(visible=False)
250
+ #api_dist.click(api_extraction,inputs=[fileinput[0],apikey_elem,gr.State(value="dist")],outputs=[result_anot,alerts_anot], api_name="dist")
251
+
252
+ #button_complete.click(api_extraction,inputs=[fileinput[0],apikey_elem,"annotation"],outputs=allres, api_name="annotation")
253
+ #button_complete.click(api_extraction,inputs=[fileinput[0],apikey_elem,"annotation"],outputs=allres, api_name="annotation")
254
+ #button_complete.click(api_extraction,inputs=[fileinput[0],apikey_elem,"annotation"],outputs=allres, api_name="annotation")
255
+ #button_complete.click(api_extraction,inputs=[fileinput[0],apikey_elem,"annotation"],outputs=allres, api_name="annotation")
256
+ #button_complete.click(api_extraction,inputs=[fileinput[0],apikey_elem,"annotation"],outputs=allres, api_name="annotation")
257
+ #button_complete.click(api_extraction,inputs=[fileinput[0],apikey_elem,"annotation"],outputs=allres, api_name="annotation")
258
+ #button_complete.click(api_extraction,inputs=[fileinput[0],apikey_elem,"annotation"],outputs=allres, api_name="annotation")
259
+ #button_complete.click(api_extraction,inputs=[fileinput[0],apikey_elem,"annotation"],outputs=allres, api_name="annotation")
260
 
261
 
262
  # Run the app
263
  #demo.queue(concurrency_count=5,max_size=20).launch()
264
+ demo.launch(share=False,show_api=False,auth=("CIKM2023", "demodemo"))
265
 
src/extractor.py ADDED
@@ -0,0 +1,619 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ import gradio as gr
3
+ from langchain.embeddings import OpenAIEmbeddings
4
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
5
+ from langchain.vectorstores.faiss import FAISS
6
+ from langchain.chains.question_answering import load_qa_chain
7
+ from langchain.chains import LLMChain
8
+ from langchain.llms import OpenAI
9
+ from langchain import PromptTemplate
10
+ from langchain.docstore.document import Document
11
+ import pandas as pd
12
+ import os
13
+ import scipdf ## You need a Gorbid service available
14
+ import tabula ## You need to have the Java Tabula installed in the environment
15
+ from gradio import DataFrame
16
+ import asyncio
17
+ from transformers import pipeline
18
+ from dotenv import load_dotenv
19
+
20
+
21
+ class Extractor:
22
def __init__(self):
    """Load the zero-shot classifier used by the post-processing stage."""
    print("Initializing extractor")
    # Classifier applied to the extracted answers when building the completeness report
    self.classifier = pipeline(task="zero-shot-classification", model="valhalla/distilbart-mnli-12-1")
26
+
27
async def extraction(self, file_name, file_path, apikey, dimension):
    """Extract a single documentation dimension from a file.

    Args:
        file_name: Name of the document (used for the vector cache and to
            detect the file type).
        file_path: Path of the document on disk.
        apikey: OpenAI API key.
        dimension: One of "annotation", "gathering", "uses", "contrib",
            "comp", "social" or "dist".

    Returns:
        The ``(results, completeness_report)`` tuple of the matching
        ``get_*_dimension`` method.

    Raises:
        ValueError: If ``dimension`` is not one of the supported keys.
    """
    getter_names = {
        "annotation": "get_annotation_dimension",
        "gathering": "get_gathering_dimension",
        "uses": "get_uses_dimension",
        "contrib": "get_contributors_dimension",
        "comp": "get_composition_dimension",
        "social": "get_social_concerns_dimension",
        "dist": "get_distribution_dimension",
    }
    # Reject unknown keys before any chain or embedding work is done; the
    # previous if/elif chain left ``results`` unbound in that case.
    if dimension not in getter_names:
        raise ValueError("Unknown dimension: " + repr(dimension))

    # Build the chains
    chain_incontext, chain_table = self.build_chains(apikey)
    # Prepare the data
    docsearch = await self.prepare_data(file_name, file_path, chain_table, apikey)
    # Extract the requested dimension
    getter = getattr(self, getter_names[dimension])
    results, completeness_report = await getter(docsearch, chain_incontext, retrieved_docs=10)
    return results, completeness_report
50
+
51
async def complete_extraction(self, file_name, file_path, apikey):
    """Extract every dimension of a document concurrently.

    Args:
        file_name: Name of the document.
        file_path: Path of the document on disk.
        apikey: OpenAI API key used for both the chains and the data
            preparation step.

    Returns:
        A list with one ``get_*_dimension`` result per dimension, in the
        order annotation, gathering, uses, contributors, composition,
        social concerns, distribution.
    """
    # Build the chains
    chain_incontext, chain_table = self.build_chains(apikey=apikey)
    # Prepare the data with the caller's key (previously the environment
    # variable was used here, ignoring the ``apikey`` argument)
    docsearch = await self.prepare_data(file_name, file_path, chain_table, apikey=apikey)
    # Retrieve all the dimensions at once
    getters = (
        self.get_annotation_dimension,
        self.get_gathering_dimension,
        self.get_uses_dimension,
        self.get_contributors_dimension,
        self.get_composition_dimension,
        self.get_social_concerns_dimension,
        self.get_distribution_dimension,
    )
    results = await asyncio.gather(
        *[getter(docsearch, chain_incontext, retrieved_docs=10) for getter in getters]
    )
    return results
65
+
66
+ # Extract text from a PDF file using scipdf and a GROBID service (a running GROBID instance is required)
67
def extract_text_from_pdf(self, file_path):
    """Parse a PDF with scipdf and flatten it into one string.

    The output is the title, authors and abstract followed by every section
    as ``"<heading>: <text>"``, with the pieces separated by " \\n\\n ".

    Args:
        file_path: Path of the PDF handed to ``scipdf.parse_pdf_to_dict``.

    Returns:
        The concatenated text of the article.
    """
    article = scipdf.parse_pdf_to_dict(file_path, soup=True, return_coordinates=False, grobid_url="https://kermitt2-grobid.hf.space")  # returns a dictionary
    print("PDF parsed")
    pieces = [
        article['title'], " \n\n ",
        article['authors'], " \n\n Abstract: ",
        article['abstract'], " \n\n ",
    ]
    for section in article['sections']:
        heading = section['heading'] + ": "
        body = section['text']
        if isinstance(body, str):
            pieces.extend((heading, body, " \n\n "))
        else:
            # Section text delivered as several paragraphs
            pieces.append(heading)
            pieces.extend(paragraph + " \n\n " for paragraph in body)
    return "".join(pieces)
80
+
81
+ # Extract and transform the tables of the papers
82
async def get_tables(self, docsearch, chain_table, input_file):
    """Describe the tables of a PDF in natural language and index them.

    Each table found by tabula is sent, together with the four indexed
    chunks most similar to "Table <n>:", to ``async_table_generate``; the
    resulting descriptions are added to the index.

    Args:
        docsearch: Vector index; must offer ``similarity_search`` and
            ``add_texts``.
        chain_table: Chain that turns a table plus context into text.
        input_file: Path of the PDF to read the tables from.

    Returns:
        The same ``docsearch`` object, with the table descriptions added.
    """
    print("Getting tables")
    tables = tabula.read_pdf(input_file, pages='all')
    pending = []
    for number, table in enumerate(tables, start=1):
        query = "Table " + str(number) + ":"
        docs = docsearch.similarity_search(query, k=4)
        pending.append(self.async_table_generate(docs, table, chain_table))
    descriptions = await asyncio.gather(*pending)
    # Index the complete descriptions. The previous code passed ``table[1]``,
    # i.e. only the second character of each description string.
    if descriptions:
        docsearch.add_texts(descriptions)
    return docsearch
97
+
98
def extract_text_clean(self, file_name, file_path):
    """Return the plain text of a PDF or TXT document.

    Args:
        file_name: Name of the document; its extension selects the parser
            (matched case-insensitively).
        file_path: Path of the document on disk.

    Returns:
        The text of the document.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.txt``.
    """
    file_extension = os.path.splitext(file_name)[1].lower()
    if file_extension == ".pdf":
        return self.extract_text_from_pdf(file_path)
    if file_extension == ".txt":
        with open(file_path, encoding="utf-8") as f:
            return f.read()
    # Returning None here used to fail later, inside the text splitter
    raise ValueError("Unsupported file type: " + repr(file_extension) + ". Please provide a PDF or TXT file")
107
+
108
async def prepare_data(self, file_name, file_path, chain_table, apikey):
    """Build, or load from the local cache, the vector index of a document.

    The index is cached under ``./vectors/<file_name>``. For a new document
    the text is extracted, split into chunks, embedded, enriched with the
    table descriptions (PDF only) and saved to the cache.

    Args:
        file_name: Name of the document; also the name of its cache folder.
        file_path: Path of the document on disk.
        chain_table: Chain used to translate tables into text.
        apikey: OpenAI API key; when empty, ``openai.api_key`` is used.

    Returns:
        The FAISS index of the document.
    """
    vectorspath = "./vectors/" + file_name
    if not apikey:
        # Fall back to the key configured at module level
        apikey = openai.api_key
    # Use the resolved key: the embeddings previously ignored the caller's key
    embeddings = OpenAIEmbeddings(openai_api_key=apikey)

    if os.path.isfile(vectorspath + "/index.faiss"):
        # The document was already processed
        docsearch = FAISS.load_local(vectorspath, embeddings=embeddings)
        print("We get the embeddings from local store")
    else:
        print("We generate the embeddings using thir-party service")
        # Get extracted running text
        text = self.extract_text_clean(file_name, file_path)

        # Configure the text splitter
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=250, chunk_overlap=10, separators=[".", ",", " \n\n "])

        # Split, and clean
        texts = [chunk.replace('\n', ' ') for chunk in text_splitter.split_text(text)]
        print("Creating embeddings")
        # Create an index search
        docsearch = FAISS.from_texts(texts, embeddings, metadatas=[{"source": i} for i in range(len(texts))])

        # Extract and prepare tables
        if os.path.splitext(file_name)[1] == '.pdf':
            docsearch = await self.get_tables(docsearch, chain_table, file_path)

        # Save the index locally
        FAISS.save_local(docsearch, vectorspath)

    return docsearch
147
+
148
def build_chains(self, apikey):
    """Create the two LLM chains used by the extraction.

    Args:
        apikey: OpenAI API key; when empty, ``openai.api_key`` is used.

    Returns:
        A tuple ``(chain_incontext, chain_table)``: a "stuff" question
        answering chain and a chain that translates a table into text.
    """
    if not apikey:
        # NOTE(review): the exception is only constructed, never raised
        gr.Error("Please set your api key")
        apikey = openai.api_key
    llm = OpenAI(model_name='text-davinci-003', openai_api_key=apikey, temperature=0)

    ## In-context prompt
    qa_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
Question: {question}
###
Context:
{context}
###
Helpful answer:
"""
    chain_incontext = load_qa_chain(
        llm,
        chain_type="stuff",
        prompt=PromptTemplate(input_variables=["context", "question"], template=qa_template),
    )

    # Table extraction prompts
    ## Table prompt to transform parsed tables in natural text
    table_template = """Given the following table in HTML, and the given context related the table: Translate the content of the table into natural language.
###
Context:
{context}
###
Table: {table}
###
Table translation:
"""
    chain_table = LLMChain(
        llm=llm,
        prompt=PromptTemplate(input_variables=["context", "table"], template=table_template),
    )

    return chain_incontext, chain_table
186
+
187
async def async_table_generate(self, docs, table, chain):
    """Run the table chain on one table and return its answer.

    Args:
        docs: Context passed to the chain under the "context" key.
        table: Table passed to the chain under the "table" key.
        chain: Object exposing an awaitable ``arun`` method.

    Returns:
        Whatever ``chain.arun`` resolves to.
    """
    payload = {"context": docs, "table": table}
    return await chain.arun(payload)
192
+
193
async def async_generate(self, dimension, docs, question, chain):
    """Answer one question with the chain and tag the answer with its dimension.

    Args:
        dimension: Title of the dimension being asked about.
        docs: Documents passed to the chain under "input_documents".
        question: Question passed to the chain under "question".
        chain: Object exposing an awaitable ``arun`` method.

    Returns:
        A two-item list ``[dimension, answer]``.
    """
    answer = await chain.arun({"input_documents": docs, "question": question})
    return [dimension, answer]
197
+
198
async def get_gathering_dimension(self, docsearch, incontext_prompt, retrieved_docs):
    """Extract the data-gathering dimension and build its completeness report.

    Every question is answered concurrently from the ``retrieved_docs`` most
    similar chunks. Each answer is then run through the zero-shot classifier;
    when the top label is the "missing" label of that question, a warning is
    added to the report.

    Args:
        docsearch: Vector index offering ``similarity_search``.
        incontext_prompt: Question-answering chain handed to ``async_generate``.
        retrieved_docs: Number of chunks retrieved per question.

    Returns:
        A tuple ``(answers, completeness_report)`` where ``answers`` is a
        list of ``[title, answer]`` and ``completeness_report`` is a dict
        with the keys "completeness" (percentage) and "report" (warnings).
    """
    questions = [
        ("Gathering description", """Provide a summary of how the data of the dataset has been collected? Please avoid mention the annotation process or data preparation processes"""),
        ("Gathering type", """Which of the following types corresponds to the gathering process mentioned in the context?

Types: Web API, Web Scrapping, Sensors, Manual Human Curator, Software collection, Surveys, Observations, Interviews, Focus groups, Document analysis, Secondary data analysis, Physical data collection, Self-reporting, Experiments, Direct measurement, Interviews, Document analysis, Secondary data analysis, Physical data collection, Self-reporting, Experiments, Direct measurement, Customer feedback data, Audio or video recordings, Image data, Biometric data, Medical or health data, Financial data, Geographic or spatial data, Time series data, User-generated content data.

Answer with "Others", if you are unsure. Please answer with only the type"""),
        ("Gathering team", """Who was the team who collect the data?"""),
        ("Team Type", """The data was collected by an internal team, an external team, or crowdsourcing team?"""),
        ("Team Demographics", "Are the any demographic information of team gathering the data?"),
        ("Timeframe", """ Which are the timeframe when the data was collected?
If present, answer only with the collection timeframe of the data. If your are not sure, or there is no mention, just answers 'not provided'"""),
        ("Sources", """Which is the source of the data during the collection process? Answer solely with the name of the source"""),
        ("Infrastructure", """Which tools or infrastructure has been used during the collection process?"""),
        # The second sentence used to ask for the timeframe (copy-paste slip)
        ("Localization", """Which are the places where data has been collected?
If present, answer only with the places where the data has been collected. If your are not sure, or there is no mention, just answers 'not provided'"""),
    ]

    # title -> (candidate labels, label meaning "missing", warning suffix).
    # The "missing" label must be one of the candidate labels, otherwise the
    # check can never fire (this was the case for the description and the
    # localization checks).
    checks = {
        "Gathering description": (
            ["Is a description of a process", "do not know"],
            "do not know",
            " is missing. Please provide an explanation of the data collection process",
        ),
        "Gathering type": (
            ["Is others", "Is not others"],
            "Is others",
            " is missing. The type cannot be inferred. Provide a better explanation of the gathering process",
        ),
        "Gathering team": (
            ["Is a explanation of a team", "Do not know"],
            "Do not know",
            " is missing. This information is relevant to evaluate the quality of the data",
        ),
        "Team Type": (
            ["Is intenal, external or crowdsourcing", "Do not know"],
            "Do not know",
            " is missing. This information is relevant to evaluate the quality of the data",
        ),
        "Team Demographics": (
            ["Have demographics information", "Do not have demographics information"],
            "Do not have demographics information",
            " is missing. This information is relevant to evaluate the quality of the labels",
        ),
        "Localization": (
            ["Where data has been collected", "unknown"],
            "unknown",
            " is missing. Please indicate where the data has been collected",
        ),
        "Timeframe": (
            ["It is a date", "It is not a date"],
            "It is not a date",
            " is missing. Please indicate when the data has been collected",
        ),
        "Infrastructure": (
            ["Is a tool or an infrastructure", "Is not a tool or an infrastructure"],
            "Is not a tool or an infrastructure",
            " is missing. Please indicate the infrastructure used to collect the data",
        ),
        "Sources": (
            ["Is source of data", "Do not know"],
            "Do not know",
            " is missing. Please indicate the source used to collect the data",
        ),
    }

    # Launch every question concurrently
    pending = []
    for title, question in questions:
        docs = docsearch.similarity_search(question, k=retrieved_docs)
        pending.append(self.async_generate(title, docs, question, incontext_prompt))
    answers = await asyncio.gather(*pending)

    # Post-processing: flag the dimensions the documentation does not cover
    report = []
    for title, answer in answers:
        check = checks.get(title)
        if check is None:
            continue
        labels, missing_label, advice = check
        classifications = self.classifier(answer, labels)
        if classifications['labels'][0] == missing_label:
            print("Dimension: " + title + " is missing. Inserting a warning")
            report.append(title + advice)

    if len(report) == 0:
        report.append("No warnings")
        completeness = 100
    else:
        completeness = round((1 - len(report) / len(answers)) * 100)
    completeness_report = {"completeness": completeness, "report": report}
    return answers, completeness_report
280
+
281
async def get_annotation_dimension(self, docsearch, incontext_prompt, retrieved_docs):
    """Extract the annotation dimensions of a dataset and report the missing ones.

    For every annotation question the most similar chunks are retrieved from
    ``docsearch`` and handed to ``self.async_generate``; all generations are
    awaited together. Each answer is then passed to ``self.classifier`` with
    two candidate labels, and a warning is recorded when the first entry of
    the returned ``labels`` is the one meaning "not documented".

    Args:
        docsearch: Object exposing ``similarity_search(question, k=...)``.
        incontext_prompt: Prompt forwarded unchanged to ``self.async_generate``.
        retrieved_docs: Number of chunks requested per question (``k``).

    Returns:
        A tuple ``(answers, completeness_report)``. ``answers`` is the list of
        generation results in question order (each one is read as
        ``result[0]`` = title, ``result[1]`` = answer text).
        ``completeness_report`` is a dict with ``"completeness"`` (integer
        percentage) and ``"report"`` (list of warnings, or ``["No warnings"]``).
    """
    dimensions = [
        {"Annotation description":"""How the data of the has been annotated or labelled? Provide a short summary of the annotation process"""},
        {"Annotation type":""" Which of the following category corresponds to the annotation process mentioned in the context?
            Categories: Bounding boxes, Lines and splines, Semantinc Segmentation, 3D cuboids, Polygonal segmentation, Landmark and key-point, Image and video annotations, Entity annotation, Content and textual categorization
            If you are not sure, answer with 'others'. Please answer only with the categories provided in the context. """},
        {"Labels":""" Which are the specific labels of the dataset? Can you enumerate it an provide a description of each one?"""},
        {"Team Description": """Who has annotated the data?"""},
        {"Team type": """The data was annotated by an internal team, an external team, or crowdsourcing team?""" },
        {"Team Demographics": """Is there any demographic information about the team who annotate the data?"""},
        {"Infrastructure": """Which tool has been used to annotate or label the dataset?"""},
        {"Validation": """How the quality of the labels have been validated?""" }
    ]

    # title -> (candidate labels, label meaning "missing", console suffix, warning suffix).
    # The keys MUST be spelled exactly like the titles in `dimensions`, and the
    # "missing" label MUST be one of the candidate labels; otherwise the check
    # silently never fires (this was the case for five of the eight checks).
    checks = {
        "Annotation description": (
            ["Is a description of a process", "Is unknown"],
            "Is unknown",
            " is missing in the documentation",
            " is missing. Please, provide a better explanation of the annotation process",
        ),
        "Annotation type": (
            ["Is others", "Is not others"],
            "Is others",
            " is missing in the documentation",
            " is missing. The type of the annotation process cannot be infered form the documentation. Please, provide a better explanation of the process",
        ),
        "Labels": (
            ["Labels explanation", "do not know"],
            "do not know",
            " is missing in the documentation",
            " is missing. Please provide a better explanation of the labels generated with the annotation process",
        ),
        "Team Description": (
            ["Is a description of a team", "Is not a description of a team"],
            "Is not a description of a team",
            " is missing in the documentation",
            " is missing. This information is relevant to evaluate the quality of the labels",
        ),
        "Team type": (
            ["Is intenal, external or crowdsourcing", "Do not know"],
            "Do not know",
            " is missing. Inserting a warning",
            " is missing. This information is relevant to evaluate the quality of the data",
        ),
        "Team Demographics": (
            ["Have demographics information", "Do not have demographics information"],
            "Do not have demographics information",
            " is missing. Inserting a warning",
            " is missing. This information is relevant to evaluate the quality of the labels",
        ),
        "Infrastructure": (
            ["Is a tool or an infrastructure", "Is not a tool or an infrastructure"],
            "Is not a tool or an infrastructure",
            " is missing. Inserting a warning",
            " is missing. Please indicate the infrastructure used to annotate the data",
        ),
        "Validation": (
            ["Is there a method", "It is not a method"],
            "It is not a method",
            " is missing. Inserting a warning",
            " is missing. Please indicate how the annotation have been validated",
        ),
    }

    results = []
    for dimension in dimensions:
        for title, question in dimension.items():
            docs = docsearch.similarity_search(question, k=retrieved_docs)
            results.append(self.async_generate(title, docs, question, incontext_prompt))

    answers = await asyncio.gather(*results)

    ## Post-processing
    report = []
    for result in answers:
        check = checks.get(result[0])
        if check is None:
            continue
        candidate_labels, missing_label, log_suffix, warning_suffix = check
        classifications = self.classifier(result[1], candidate_labels)
        if classifications['labels'][0] == missing_label:
            print("Dimension: " + result[0] + log_suffix)
            report.append(result[0] + warning_suffix)

    if report:
        completeness = round((1 - len(report) / len(answers)) * 100)
    else:
        report.append("No warnings")
        completeness = 100
    completeness_report = {"completeness": completeness, "report": report}

    return answers, completeness_report
354
+
355
async def get_social_concerns_dimension(self, docsearch, incontext_prompt, retrieved_docs):
    """Extract the social-concern dimensions and flag the undocumented ones.

    Each question is used to retrieve ``retrieved_docs`` chunks from
    ``docsearch`` and is answered through ``self.async_generate``; the
    generations are awaited together. Every answer is then passed to
    ``self.classifier`` with two candidate labels; when the first returned
    label is "Not mentioned or do not know" a warning is added to the report.

    Returns:
        ``(answers, completeness_report)`` where ``completeness_report`` holds
        ``"completeness"`` (integer percentage) and ``"report"`` (list of
        warnings, or ``["No warnings"]``).
    """
    questions = {
        "Representativeness": """Are there any social group that could be misrepresented in the dataset?""",
        "Biases": """Is there any potential bias or imbalance in the data?""",
        "Sensitivity": """ Are there sensitive data, or data that can be offensive for people in the dataset?""",
        "Privacy": """ Is there any privacy issues on the data?""",
    }
    not_documented = "Not mentioned or do not know"
    # title -> (label meaning "documented", text appended to the title in the warning)
    checks = {
        "Representativeness": (
            "Representativeness",
            " is missing. Is there any representativeness issue in your data?",
        ),
        "Biases": (
            "Is a bias explanation",
            " is missing. Are you sure there is no potential bias in your data?",
        ),
        "Sensitivity": (
            "Explanation of sensibilty data issue",
            " is missing. Are you sure there is no sensitivity data in your dataset?",
        ),
        "Privacy": (
            "Is privacy issue",
            " is missing. Are you sure there is no privacy issues in your data?",
        ),
    }

    pending = [
        self.async_generate(
            title,
            docsearch.similarity_search(question, k=retrieved_docs),
            question,
            incontext_prompt,
        )
        for title, question in questions.items()
    ]
    answers = await asyncio.gather(*pending)

    ## Post-processing
    report = []
    for item in answers:
        name = item[0]
        if name not in checks:
            continue
        documented_label, warning_suffix = checks[name]
        outcome = self.classifier(item[1], [documented_label, not_documented])
        if outcome['labels'][0] == not_documented:
            print("Dimension: " + name + " is missing in the documentation")
            report.append(name + warning_suffix)

    if not report:
        report.append("No warnings")
        completeness = 100
    else:
        completeness = round((1 - len(report) / len(answers)) * 100)

    return answers, {"completeness": completeness, "report": report}
403
+
404
async def get_uses_dimension(self, docsearch, incontext_prompt, retrieved_docs):
    """Extract the intended-use dimensions of a dataset and report the missing ones.

    Chunks are retrieved from ``docsearch`` with the short question of each
    dimension. For the "Task" dimension the question sent to
    ``self.async_generate`` is then replaced by a closed list of task names
    (retrieval still uses the short question). All generations are awaited
    together and each answer is checked with ``self.classifier``.

    Args:
        docsearch: Object exposing ``similarity_search(question, k=...)``.
        incontext_prompt: Prompt forwarded unchanged to ``self.async_generate``.
        retrieved_docs: Number of chunks requested per question (``k``).

    Returns:
        ``(answers, completeness_report)`` where ``completeness_report`` holds
        ``"completeness"`` (integer percentage) and ``"report"`` (list of
        warnings, or ``["No warnings"]``).
    """
    dimensions = [
        {"Purposes":"""Which are the purpose or purposes of the dataset?"""},
        {"Gaps":"""Which are the gaps the dataset intend to fill?"""},
        {"Task":"""Which machine learning tasks the dataset inteded for?:"""},
        {"Recommended":"""For which applications the dataset is recommended?"""},
        # Fixed prompt: the fallback answer "no" applies when there is NO
        # non-recommended use mentioned (the previous wording said "any").
        {"Non-Recommneded":"""Is there any non-recommneded application for the dataset? If you are not sure, or there is no non-recommended use of the dataset metioned in the context, just answer with "no"."""},
    ]
    task_question = """Which of the following ML tasks for the dataset best matches the context?

        Tasks: text-classification, question-answering, text-generation, token-classification, translation,
        fill-mask, text-retrieval, conditional-text-generation, sequence-modeling, summarization, other,
        structure-prediction, information-retrieval, text2text-generation, zero-shot-retrieval,
        zero-shot-information-retrieval, automatic-speech-recognition, image-classification, speech-processing,
        text-scoring, audio-classification, conversational, question-generation, image-to-text, data-to-text,
        classification, object-detection, multiple-choice, text-mining, image-segmentation, dialog-response-generation,
        named-entity-recognition, sentiment-analysis, machine-translation, tabular-to-text, table-to-text, simplification,
        sentence-similarity, zero-shot-classification, visual-question-answering, text_classification, time-series-forecasting,
        computer-vision, feature-extraction, symbolic-regression, topic modeling, one liner summary, email subject, meeting title,
        text-to-structured, reasoning, paraphrasing, paraphrase, code-generation, tts, image-retrieval, image-captioning,
        language-modelling, video-captionning, neural-machine-translation, transkation, text-generation-other-common-sense-inference,
        text-generation-other-discourse-analysis, text-to-tabular, text-generation-other-code-modeling, other-text-search

        If you are not sure answer with just with "others".
        Please, answer only with one or some of the provided tasks """

    # title -> (candidate labels, label meaning "missing", warning suffix)
    checks = {
        "Purposes": (
            ["Is there purposes", "Not mentioned or do not know"],
            "Not mentioned or do not know",
            " is missing. Please provide a better explanation of the purposes of the dataset",
        ),
        "Gaps": (
            ["Gaps the dataset intends to fill", "Not mentioned or do not know"],
            "Not mentioned or do not know",
            " is missing. Which gaps this dataset intends to fill?",
        ),
        "Task": (
            ["Is a task", "Others"],
            "Others",
            " is missing. The task of the dataset cannot be inferred, please provide a better explanation of its purposes?",
        ),
        "Recommended": (
            ["Is a recommendation", "Not mentioned or do not know"],
            "Not mentioned or do not know",
            " is missing. Which are the uses recommendation of your dataset?",
        ),
        "Non-Recommneded": (
            ["Is a non-recommneded use", "No non-recommended use"],
            "No non-recommended use",
            " is missing. Is there any non-recommended use of the data?",
        ),
    }

    results = []
    for dimension in dimensions:
        for title, question in dimension.items():
            # Retrieval uses the short question even for "Task".
            docs = docsearch.similarity_search(question, k=retrieved_docs)
            if title == "Task":
                question = task_question
            results.append(self.async_generate(title, docs, question, incontext_prompt))

    answers = await asyncio.gather(*results)

    ## Post-processing
    report = []
    for result in answers:
        check = checks.get(result[0])
        if check is None:
            continue
        candidate_labels, missing_label, warning_suffix = check
        classifications = self.classifier(result[1], candidate_labels)
        if classifications['labels'][0] == missing_label:
            print("Dimension: " + result[0] + " is missing in the documentation")
            report.append(result[0] + warning_suffix)

    if report:
        completeness = round((1 - len(report) / len(answers)) * 100)
    else:
        report.append("No warnings")
        completeness = 100
    completeness_report = {"completeness": completeness, "report": report}

    return answers, completeness_report
474
+
475
async def get_contributors_dimension(self, docsearch, incontext_prompt, retrieved_docs):
    """Extract the contributor dimensions of a dataset and report the missing ones.

    Each question is used to retrieve chunks from ``docsearch`` and answered
    through ``self.async_generate``; the generations are awaited together.
    Every answer is then passed to ``self.classifier`` with two candidate
    labels, and a warning is recorded when the first returned label is the
    one meaning "not documented".

    Args:
        docsearch: Object exposing ``similarity_search(question, k=...)``.
        incontext_prompt: Prompt forwarded unchanged to ``self.async_generate``.
        retrieved_docs: Number of chunks requested per question (``k``).

    Returns:
        ``(answers, completeness_report)`` where ``completeness_report`` holds
        ``"completeness"`` (integer percentage) and ``"report"`` (list of
        warnings, or ``["No warnings"]``).
    """
    dimensions = [
        {"Authors":"""Who are the authors of the dataset """},
        {"Funders":"""Is there any organization which supported or funded the creation of the dataset?"""},
        {"Maintainers":"""Who are the maintainers of the dataset?"""},
        # Fixed prompt: this dimension previously reused the data-retention
        # question, so the erratum check classified an unrelated answer.
        {"Erratums":"""Is there any erratum of the dataset? If you are not sure, or there is no erratum just answer with "no"."""},
        {"Data Retention Policies":"""Is there any data retention policies policiy of the dataset? If you are not sure, or there is no retention policy just answer with "no"."""},
    ]

    # title -> (candidate labels, label meaning "missing", warning suffix).
    # Keys must match the titles above exactly; the former "Mantainers"
    # spelling meant the maintainers check never ran.
    checks = {
        "Authors": (
            ["Authors", "Not mentioned or do not know"],
            "Not mentioned or do not know",
            " is missing. Authors cannot be identified",
        ),
        "Funders": (
            ["Funders of the dataset", "Not mentioned or do not know"],
            "Not mentioned or do not know",
            " is missing. Who funded the dataset?",
        ),
        "Maintainers": (
            ["Maintainers", "Not mentioned or do not know"],
            "Not mentioned or do not know",
            " is missing. Who were the maintainers of the dataset?",
        ),
        "Erratums": (
            ["Is an Erratum", "No erratum"],
            "No erratum",
            " is missing. Is there an erratum?",
        ),
        "Data Retention Policies": (
            ["Data Retention", "No data retention policy"],
            "No data retention policy",
            " is missing. Is there any data retention policy of the data?",
        ),
    }

    results = []
    for dimension in dimensions:
        for title, question in dimension.items():
            docs = docsearch.similarity_search(question, k=retrieved_docs)
            results.append(self.async_generate(title, docs, question, incontext_prompt))

    answers = await asyncio.gather(*results)

    ## Post-processing
    report = []
    for result in answers:
        check = checks.get(result[0])
        if check is None:
            continue
        candidate_labels, missing_label, warning_suffix = check
        classifications = self.classifier(result[1], candidate_labels)
        if classifications['labels'][0] == missing_label:
            print("Dimension: " + result[0] + " is missing in the documentation")
            report.append(result[0] + warning_suffix)

    if report:
        completeness = round((1 - len(report) / len(answers)) * 100)
    else:
        report.append("No warnings")
        completeness = 100
    completeness_report = {"completeness": completeness, "report": report}

    return answers, completeness_report
527
+
528
async def get_composition_dimension(self, docsearch, incontext_prompt, retrieved_docs):
    """Extract the composition dimensions of a dataset and flag the undocumented ones.

    Each question is used to retrieve ``retrieved_docs`` chunks from
    ``docsearch`` and is answered through ``self.async_generate``; the
    generations are awaited together. Every answer is then passed to
    ``self.classifier`` with two candidate labels; when the first returned
    label is "Not mentioned or do not know" a warning is added to the report.

    Returns:
        ``(answers, completeness_report)`` where ``completeness_report`` holds
        ``"completeness"`` (integer percentage) and ``"report"`` (list of
        warnings, or ``["No warnings"]``).
    """
    questions = {
        "File composition": """Can you provide a description of each files the dataset is composed of?""",
        "Attributes": """Can you enumerate the different attributes present in the dataset? """,
        "Training splits": """The paper mentions any recommended data split of the dataset?""",
        "Relevant statistics": """Are there relevant statistics or distributions of the dataset? """,
    }
    not_documented = "Not mentioned or do not know"
    # title -> (label meaning "documented", text appended to the title in the warning)
    checks = {
        "File composition": (
            "A file composition",
            " is missing. Provide a better explanation of the file composition of the dataset",
        ),
        "Attributes": (
            "Attributes explanation",
            " is missing. Provide a better explanation of the attribute explanation of the dataset",
        ),
        "Training splits": (
            "A data split",
            " is missing. Is there any recommended data split?",
        ),
        "Relevant statistics": (
            "A statistic",
            " is missing. Is there any relevant statistic?",
        ),
    }

    pending = [
        self.async_generate(
            title,
            docsearch.similarity_search(question, k=retrieved_docs),
            question,
            incontext_prompt,
        )
        for title, question in questions.items()
    ]
    answers = await asyncio.gather(*pending)

    ## Post-processing
    report = []
    for item in answers:
        name = item[0]
        if name not in checks:
            continue
        documented_label, warning_suffix = checks[name]
        outcome = self.classifier(item[1], [documented_label, not_documented])
        if outcome['labels'][0] == not_documented:
            print("Dimension: " + name + " is missing in the documentation")
            report.append(name + warning_suffix)

    if not report:
        report.append("No warnings")
        completeness = 100
    else:
        completeness = round((1 - len(report) / len(answers)) * 100)

    return answers, {"completeness": completeness, "report": report}
575
+
576
async def get_distribution_dimension(self, docsearch, incontext_prompt, retrieved_docs):
    """Extract the distribution dimensions of a dataset and report the missing ones.

    Each question is used to retrieve chunks from ``docsearch`` and answered
    through ``self.async_generate``; the generations are awaited together.
    Every answer is then passed to ``self.classifier`` with two candidate
    labels, and a warning is recorded when the first returned label is the
    one meaning "not documented".

    Args:
        docsearch: Object exposing ``similarity_search(question, k=...)``.
        incontext_prompt: Prompt forwarded unchanged to ``self.async_generate``.
        retrieved_docs: Number of chunks requested per question (``k``).

    Returns:
        ``(answers, completeness_report)`` where ``completeness_report`` holds
        ``"completeness"`` (integer percentage) and ``"report"`` (list of
        warnings, or ``["No warnings"]``).
    """
    dimensions = [
        {"Data repository":"""Is there a link to the a repository containing the data? If you are not sure, or there is no link to the repository just answer with "no"."""},
        # Fixed prompt: the fallback answer "no" applies when there is NO
        # mention of a license (the negation was missing).
        {"Licence":"""Which is the license of the dataset. If you are not sure, or there is no mention of a license of the dataset in the context, just answer with "no". """},
        {"Deprecation policies":"""Is there any deprecation plan or policy of the dataset?
            """},
    ]

    # title -> (candidate labels, label meaning "missing", warning suffix)
    checks = {
        "Data repository": (
            ["A link to a repository", "Not mentioned or do not know"],
            "Not mentioned or do not know",
            " is missing. Where the data can be accessed?",
        ),
        "Licence": (
            ["A License", "Not mentioned or do not know"],
            "Not mentioned or do not know",
            " is missing. Which is the license of the dataset",
        ),
        "Deprecation policies": (
            ["A deprecation policy", "No a deprecation policy"],
            "No a deprecation policy",
            " is missing. Is there any deprecation policy of the dataset?",
        ),
    }

    results = []
    for dimension in dimensions:
        for title, question in dimension.items():
            docs = docsearch.similarity_search(question, k=retrieved_docs)
            results.append(self.async_generate(title, docs, question, incontext_prompt))

    answers = await asyncio.gather(*results)

    ## Post-processing
    report = []
    for result in answers:
        check = checks.get(result[0])
        if check is None:
            continue
        candidate_labels, missing_label, warning_suffix = check
        classifications = self.classifier(result[1], candidate_labels)
        if classifications['labels'][0] == missing_label:
            print("Dimension: " + result[0] + " is missing in the documentation")
            report.append(result[0] + warning_suffix)

    if report:
        completeness = round((1 - len(report) / len(answers)) * 100)
    else:
        report.append("No warnings")
        completeness = 100
    completeness_report = {"completeness": completeness, "report": report}

    return answers, completeness_report
618
+
619
+