ola13 committed on
Commit
0f43f50
1 Parent(s): 0112a25

flagging to datasets

Browse files
app.py CHANGED
@@ -1,61 +1,39 @@
1
  import json
2
  import math
 
 
3
  from functools import partial
4
 
 
5
  import streamlit as st
6
  import streamlit.components.v1 as components
7
- from gforms import Form
8
 
9
  BAD_EXAMPLES_PATH = "bad_examples"
10
  DATA_PATH = "data"
11
- MAX_DOC_LENGTH = 30000
12
-
13
-
14
- def form_callback(
15
- element,
16
- page_index,
17
- element_index,
18
- dataset,
19
- docid,
20
- text,
21
- metadata,
22
- reason,
23
- person,
24
- part,
25
- ):
26
- if element.name == "Dataset":
27
- return dataset
28
- if element.name == "Datapoint ID":
29
- return docid
30
- if element.name == "Text":
31
- return text
32
- if element.name == "Metadata":
33
- return metadata
34
- if element.name == "Flagging Reason":
35
- return reason
36
- if element.name == "Flagging Person":
37
- return person
38
- if element.name == "Part":
39
- return part
40
-
41
-
42
- def report_result(dataset, docid, text, metadata, reason, person, part):
43
- form = Form()
44
- FORM_URL = "https://docs.google.com/forms/d/e/1FAIpQLSedYTj1pBD5L6xo6qPUKY5vleNW183FXCgc3LSSgg3AUwZWKA/viewform"
45
- form.load(FORM_URL)
46
- form.fill(
47
- partial(
48
- form_callback,
49
- dataset=dataset,
50
- docid=docid,
51
- text=text,
52
- metadata=metadata,
53
- reason=reason,
54
- person=person,
55
- part=part,
56
- ),
57
  )
58
- form.submit()
59
 
60
 
61
  def load_jsonl(file_path):
@@ -84,7 +62,6 @@ def save_flag_and_get_next_item(sample, issue):
84
  f.write(json.dumps(sample) + "\n")
85
 
86
  text = sample["text"]
87
-
88
  sample.pop("text")
89
  sample.pop("issue")
90
  sample_id = ""
@@ -94,15 +71,7 @@ def save_flag_and_get_next_item(sample, issue):
94
  else:
95
  sample_id = sample["id"]
96
 
97
- if len(text) > MAX_DOC_LENGTH:
98
- num_parts = math.ceil(len(text) / MAX_DOC_LENGTH)
99
- for i in range(num_parts):
100
- text_portion = text[i * MAX_DOC_LENGTH : (i + 1) * MAX_DOC_LENGTH]
101
- report_result(
102
- dataset, sample_id, text_portion, str(sample), issue, "", str(i)
103
- )
104
- else:
105
- report_result(dataset, sample_id, text, str(sample), issue, "", str(0))
106
 
107
  get_next_item()
108
 
 
1
  import json
2
  import math
3
+ import os
4
+ import uuid
5
  from functools import partial
6
 
7
+ import jsonlines
8
  import streamlit as st
9
  import streamlit.components.v1 as components
10
+ from huggingface_hub import HfApi
11
 
12
  BAD_EXAMPLES_PATH = "bad_examples"
13
  DATA_PATH = "data"
14
+
15
+
16
+ def report_result_dataset(dataset, docid, text, metadata, reason, annotator):
17
+ with jsonlines.open("report.jsonl", "w") as f:
18
+ f.write(
19
+ {
20
+ "dataset": dataset,
21
+ "docid": docid,
22
+ "text": text,
23
+ "metadata": metadata,
24
+ "reason": reason,
25
+ "annotator": annotator,
26
+ }
27
+ )
28
+
29
+ api = HfApi()
30
+ api.upload_file(
31
+ path_or_fileobj="report.jsonl",
32
+ path_in_repo="report-{}.jsonl".format(uuid.uuid4()),
33
+ repo_id="HuggingFaceGECLM/data_feedback",
34
+ repo_type="dataset",
35
+ token=os.environ.get("geclm_token"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  )
 
37
 
38
 
39
  def load_jsonl(file_path):
 
62
  f.write(json.dumps(sample) + "\n")
63
 
64
  text = sample["text"]
 
65
  sample.pop("text")
66
  sample.pop("issue")
67
  sample_id = ""
 
71
  else:
72
  sample_id = sample["id"]
73
 
74
+ report_result_dataset(dataset, sample_id, text, str(sample), issue, "")
 
 
 
 
 
 
 
 
75
 
76
  get_next_item()
77
 
bad_examples/reddit_threaded_bad_examples.jsonl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90cccebb836615224b151fe1576ad3667933d425bc16e0e8f231671e151b0dbb
3
- size 2971
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:562ed8ca881c564329b0cb138863cdedfd8339913635ddd1f4c733fc723b3230
3
+ size 13634
report.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c93030e14be30172f1313437e54c47c138d99b91cedefc965f88ba1e5c6025c6
3
+ size 2250
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
- gforms
 
2
  streamlit==1.20.0
 
1
+ huggingface_hub
2
+ jsonlines
3
  streamlit==1.20.0