poyum committed on
Commit
f709e5e
·
1 Parent(s): 2dfb14b

gradio space init

Files changed (6)
  1. disrpt_eval_2025.py +517 -0
  2. disrpt_io.py +846 -0
  3. eval.py +760 -0
  4. pipeline.py +142 -0
  5. reading.py +512 -0
  6. utils.py +216 -0
disrpt_eval_2025.py ADDED
@@ -0,0 +1,517 @@
+ """
+ Script to evaluate segmentation f-score and the proportion of perfectly segmented discourse units from two files. Two input formats are permitted:
+
+ * One token per line, with ten columns, no sentence breaks (default *.tok format) - segmentation indicated in column 10
+ * The same, but with blank lines between sentences (*.conll format)
+
+ Token columns follow the CoNLL-U format, with token IDs in the first column and pipe-separated key=value pairs in the last column.
+ Document boundaries are indicated by a comment: # newdoc_id = ...
+ The evaluation uses micro-averaged F-scores per corpus (not a document macro-average).
+
+ Example:
+
+ # newdoc_id = GUM_bio_byron
+ 1 Education _ _ _ _ _ _ _ Seg=B-seg
+ 2 and _ _ _ _ _ _ _ _
+ 3 early _ _ _ _ _ _ _ _
+ 4 loves _ _ _ _ _ _ _ _
+ 5 Byron _ _ _ _ _ _ _ Seg=B-seg
+ 6 received _ _ _ _ _ _ _ _
+
+ Or:
+
+ # newdoc_id = GUM_bio_byron
+ # sent_id = GUM_bio_byron-1
+ # text = Education and early loves
+ 1 Education education NOUN NN Number=Sing 0 root _ Seg=B-seg
+ 2 and and CCONJ CC _ 4 cc _ _
+ 3 early early ADJ JJ Degree=Pos 4 amod _ _
+ 4 loves love NOUN NNS Number=Plur 1 conj _ _
+
+ # sent_id = GUM_bio_byron-2
+ # text = Byron received his early formal education at Aberdeen Grammar School, and in August 1799 entered the school of Dr. William Glennie, in Dulwich. [17]
+ 1 Byron Byron PROPN NNP Number=Sing 2 nsubj _ Seg=B-seg
+ 2 received receive VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root _ _
+
+ For PDTB-style corpora, we calculate exact span-wise f-scores for the BIO encoding, without partial credit. In other words,
+ predicting an incorrect span with partial overlap costs the same as missing a gold span and predicting an incorrect span
+ somewhere else in the corpus. Note also that spans must begin with B-conn - predicted spans beginning with I-conn are ignored.
+ The file format for PDTB-style corpora is similar, but with different labels:
+
+ 1 Fidelity Fidelity PROPN NNP _ 6 nsubj _ _
+ 2 , , PUNCT , _ 6 punct _ _
+ 3 for for ADP IN _ 4 case _ Conn=B-conn
+ 4 example example NOUN NN _ 6 obl _ Conn=I-conn
+ 5 , , PUNCT , _ 6 punct _ _
+ 6 prepared prepare VERB VBN _ 0 root _ _
+ 7 ads ad NOUN NNS _ 6 obj _ _
+
+ Arguments:
+ * goldfile: shared task gold test data
+ * predfile: same format, with predicted segment positions in column 10 - note **the number of tokens must match**
+ * string_input: if specified, file names are replaced by strings holding the file contents
+ * no_boundary_edu: specify to eval only intra-sentence EDUs
+ """
+
+ """ TODO
+ - OK labels: passed as arguments, not hard-coded
+ - OK option to skip sentence beginnings: cf. the "BIO no B" script
+ - OK print the results more cleanly: without the odd "o"
+ - OK make 2 classes, edu and connectives (conn: future experiment for evaluating extended connectives vs. connective heads)
+ - cleaner solution for the label column?
+ - make an Eval class and turn the two evaluators into subclasses of Eval
+ """
+
+ __author__ = "Amir Zeldes, Janet Liu, Laura Rivière"
+ __license__ = "Apache 2.0"
+ __version__ = "2.0.0"
+
+ import io, os, sys, argparse
+ import json
+ from sklearn.metrics import accuracy_score, classification_report
+
+ # MWE and ellipsis: no label, or "_"
+ # TODO:
+ # print scores *100: 0.6825 => 68.25
+ # documentation (automatic generation?)
+ # unit tests
+
+ class Evaluation:
+     """
+     Generic class for evaluation between 2 files.
+     :load data, basic checks, basic metrics, print results.
+     """
+     def __init__(self, name: str) -> None:
+         self.output = dict()
+         self.name = name
+         self.report = ""
+         self.fill_output('doc_name', self.name)
+
+     def get_data(self, infile: str, str_i=False) -> str:
+         """
+         Store data from a file or a string.
+         """
+         if str_i == False:
+             data = io.open(infile, encoding="utf-8").read().strip().replace("\r", "")
+         else:
+             data = infile.strip()
+         return data
+
+     def fill_output(self, key: str, value) -> None:
+         """
+         Fill the results dict that will be printed.
+         """
+         self.output[key] = value
+
+     def check_tokens_number(self, g: list, p: list) -> None:
+         """
+         Check that both compared files have the same number of tokens/labels.
+         """
+         if len(g) != len(p):
+             self.report += "\nFATAL: different number of tokens detected in gold and pred:\n"
+             self.report += ">>> In " + self.name + ": " + str(len(g)) + " gold tokens but " + str(len(p)) + " predicted tokens\n\n"
+             sys.stderr.write(self.report)
+             sys.exit(1)
+
+     def check_identical_tokens(self, g: list, p: list) -> None:
+         """
+         Check that tokens/features are identical.
+         """
+         for i, tok in enumerate(g):
+             if tok != p[i]:
+                 self.report += "\nWARN: token strings do not match in gold and pred:\n"
+                 self.report += ">>> First instance in " + self.name + " token " + str(i) + "\n"
+                 self.report += "Gold: " + tok + " but Pred: " + p[i] + "\n\n"
+                 sys.stderr.write(self.report)
+                 break
+
+     def compute_PRF_metrics(self, tp: int, fp: int, fn: int) -> None:
+         """
+         Compute Precision, Recall and F-score from True Positive, False Positive and False Negative counts.
+         Save the results in the output dict.
+         """
+         try:
+             precision = tp / (float(tp) + fp)
+         except ZeroDivisionError:
+             precision = 0
+
+         try:
+             recall = tp / (float(tp) + fn)
+         except ZeroDivisionError:
+             recall = 0
+
+         try:
+             f_score = 2 * (precision * recall) / (precision + recall)
+         except ZeroDivisionError:
+             f_score = 0
+
+         self.fill_output("gold_count", tp + fn)
+         self.fill_output("pred_count", tp + fp)
+         self.fill_output("precision", precision)
+         self.fill_output("recall", recall)
+         self.fill_output("f_score", f_score)
+
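+     # Worked example (illustrative numbers, not from any corpus): with tp=8, fp=2, fn=4,
+     # precision = 8/10 = 0.8, recall = 8/12 ≈ 0.667, and
+     # f_score = 2 * (0.8 * 0.667) / (0.8 + 0.667) ≈ 0.727.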
+     def compute_accuracy(self, g: list, p: list, k: str) -> None:
+         """
+         Compute the accuracy of a list of predicted items against a gold list of items.
+         :g: gold list
+         :p: predicted list
+         :k: name detail of the accuracy
+         """
+         self.fill_output(f"{k}_accuracy", accuracy_score(g, p))
+         self.fill_output(f"{k}_gold_count", len(g))
+         self.fill_output(f"{k}_pred_count", len(p))
+
+     def classif_report(self, g: list, p: list, key: str) -> None:
+         """
+         Compute Precision, Recall and F-score for each label occurring in the gold list.
+         """
+         stats_dict = classification_report(g, p, labels=sorted(set(g)), zero_division=0.0, output_dict=True)
+         self.fill_output(f'{key}_classification_report', stats_dict)
+
+     def print_results(self) -> None:
+         """
+         Print the dict of saved results.
+         """
+         # for k in self.output.keys():
+         #     print(f">> {k} : {self.output[k]}")
+
+         print(json.dumps(self.output, indent=4))
+
+
+ class RelationsEvaluation(Evaluation):
+     """
+     Specific evaluation class for relation classification.
+     The evaluation uses the simple accuracy score per corpus.
+     :rels disrpt-style data.
+     :default eval on the last column, "label"
+     :option eval on the relation type (pdtb: implicit, explicit...), column "rel_type"
+     """
+
+     HEADER = "doc\tunit1_toks\tunit2_toks\tunit1_txt\tunit2_txt\tu1_raw\tu2_raw\ts1_toks\ts2_toks\tunit1_sent\tunit2_sent\tdir\trel_type\torig_label\tlabel"
+     # HEADER_23 = "doc\tunit1_toks\tunit2_toks\tunit1_txt\tunit2_txt\ts1_toks\ts2_toks\tunit1_sent\tunit2_sent\tdir\torig_label\tlabel"
+
+     LABEL_ID = -1
+     TYPE_ID = -3
+     DISRPT_TYPES = ['Implicit', 'Explicit', 'AltLex', 'AltLexC', 'Hypophora']
+
+     def __init__(self, name: str, gold_path: str, pred_path: str, str_i=False, rel_type=False) -> None:
+         super().__init__(name)
+         """
+         :param gold_file: Gold shared task file
+         :param pred_file: File with predictions
+         :param string_input: If True, file names are replaced by strings with the file contents (for import inside other scripts)
+         :param rel_type: If True, scores are computed on the types column, not the label (relevant for PDTB)
+         """
+         self.mode = "rel"
+         self.g_path = gold_path
+         self.p_path = pred_path
+         self.opt_str_i = str_i
+         self.opt_rel_t = rel_type
+         self.key = "labels"
+
+         self.fill_output("options", {"s": self.opt_str_i, "rt": self.opt_rel_t})
+
+     def compute_scores(self) -> None:
+         """
+         Get the lists of data to compare, then compute the metrics.
+         """
+         gold_units, gold_labels, gold_types = self.parse_rels_data(self.g_path, self.opt_str_i, self.opt_rel_t)
+         pred_units, pred_labels, pred_types = self.parse_rels_data(self.p_path, self.opt_str_i, self.opt_rel_t)
+         self.check_tokens_number(gold_labels, pred_labels)
+         self.check_identical_tokens(gold_units, pred_units)
+
+         self.compute_accuracy(gold_labels, pred_labels, self.key)
+         self.classif_report(gold_labels, pred_labels, self.key)
+
+         if self.opt_rel_t:
+             self.get_types_scores(gold_labels, pred_labels, gold_types)
+
+     def get_types_scores(self, g: list, p: list, tg: list) -> None:
+         """
+         Score predictions against gold labels, broken down by relation type.
+         """
+         for t in self.DISRPT_TYPES:
+             gold_t = []
+             pred_t = []
+             for i, _ in enumerate(g):
+                 if tg[i] == t.lower():
+                     gold_t.append(g[i])
+                     pred_t.append(p[i])
+
+             self.compute_accuracy(gold_t, pred_t, f"types_{t}")
+
+     def parse_rels_data(self, path: str, str_i: bool, rel_t: bool) -> tuple[list[str], list[str], list[str]]:
+         """
+         Rels format from DISRPT = a header, then one relation classification instance per line.
+         :LREC_2024_header = 15 columns.
+         """
+         data = self.get_data(path, str_i)
+         header = data.split("\n")[0]
+         assert header == self.HEADER, "Unrecognized .rels header."
+         # column_ID = self.TYPE_ID if rel_t == True else self.LABEL_ID
+
+         rels = data.split("\n")[1:]
+         labels = [line.split("\t")[self.LABEL_ID] for line in rels]  # .lower()
+         units = [" ".join(line.split("\t")[:3]) for line in rels]
+         types = [line.split("\t")[self.TYPE_ID] for line in rels] if rel_t == True else []
+
+         return units, labels, types
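+     # Illustrative sketch (hypothetical row, middle columns elided): a .rels line ending in
+     # "...\t1>2\texplicit\tcontingency.cause\tcause" yields
+     # label = line.split("\t")[-1] -> "cause" and type = line.split("\t")[-3] -> "explicit",
+     # while `units` keeps the first three columns (doc and the two unit token spans).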
+
+
+ class ConnectivesEvaluation(Evaluation):
+     """
+     Specific evaluation class for PDTB connective detection.
+     :parse conllu-style data
+     :eval on strict connective spans
+     """
+     LAB_CONN_B = "Conn=B-conn"  # "Seg=B-Conn"
+     LAB_CONN_I = "Conn=I-conn"  # "Seg=I-Conn"
+     LAB_CONN_O = "Conn=O"  # "_"
+
+     def __init__(self, name: str, gold_path: str, pred_path: str, str_i=False) -> None:
+         super().__init__(name)
+         """
+         :param gold_file: Gold shared task file
+         :param pred_file: File with predictions
+         :param string_input: If True, file names are replaced by strings with the file contents (for import inside other scripts)
+         """
+         self.mode = "conn"
+         self.seg_type = "connective spans"
+         self.g_path = gold_path
+         self.p_path = pred_path
+         self.opt_str_i = str_i
+
+         self.fill_output('seg_type', self.seg_type)
+         self.fill_output("options", {"s": self.opt_str_i})
+
+     def compute_scores(self) -> None:
+         """
+         Get the lists of data to compare, then compute the metrics.
+         """
+         gold_tokens, gold_labels, gold_spans = self.parse_conn_data(self.g_path, self.opt_str_i)
+         pred_tokens, pred_labels, pred_spans = self.parse_conn_data(self.p_path, self.opt_str_i)
+
+         self.output['tok_count'] = len(gold_tokens)
+
+         self.check_tokens_number(gold_tokens, pred_tokens)
+         self.check_identical_tokens(gold_tokens, pred_tokens)
+         tp, fp, fn = self.compare_spans(gold_spans, pred_spans)
+         self.compute_PRF_metrics(tp, fp, fn)
+
+     def compare_spans(self, gold_spans: tuple, pred_spans: tuple) -> tuple[int, int, int]:
+         """
+         Compare exact spans.
+         """
+         true_positive = 0
+         false_positive = 0
+         false_negative = 0
+
+         for span in gold_spans:  # not verified
+             if span in pred_spans:
+                 true_positive += 1
+             else:
+                 false_negative += 1
+         for span in pred_spans:
+             if span not in gold_spans:
+                 false_positive += 1
+
+         return true_positive, false_positive, false_negative
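+     # Example of exact span matching (illustrative): with gold_spans = [(3, 4), (10, 10)]
+     # and pred_spans = [(3, 4), (10, 11)], (3, 4) counts as a true positive, while the
+     # overlapping-but-unequal (10, 10)/(10, 11) pair yields one false negative plus one
+     # false positive - no partial credit, as stated in the module docstring.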
+
+     def parse_conn_data(self, path: str, str_i: bool) -> tuple[list, list, list]:
+         """
+         LABEL = in the last column
+         """
+         data = self.get_data(path, str_i)
+         tokens = []
+         labels = []
+         spans = []
+         counter = 0
+         span_start = -1
+         span_end = -1
+         for line in data.split("\n"):  # this loop is the same as in version 1
+             if line.startswith("#") or line == "":
+                 continue
+             else:
+                 fields = line.split("\t")  # Token
+                 label = fields[-1]
+                 if "-" in fields[0] or "." in fields[0]:  # multi-word expression or ellipsis: no prediction should be there
+                     continue
+                 elif self.LAB_CONN_B in label:
+                     if span_start > -1:  # add the previous span
+                         if span_end == -1:
+                             span_end = span_start
+                         spans.append((span_start, span_end))
+                         span_end = -1
+                     label = self.LAB_CONN_B
+                     span_start = counter
+                 elif self.LAB_CONN_I in label:
+                     label = self.LAB_CONN_I
+                     span_end = counter
+                 else:
+                     label = "_"
+                     if span_start > -1:  # add the previous span
+                         if span_end == -1:
+                             span_end = span_start
+                         spans.append((span_start, span_end))
+                         span_start = -1
+                         span_end = -1
+
+                 tokens.append(fields[1])
+                 labels.append(label)
+                 counter += 1
+
+         if span_start > -1 and span_end > -1:  # add the last span
+             spans.append((span_start, span_end))
+
+         if not self.LAB_CONN_B in labels:
+             print(f"Unrecognized labels. Expecting: {self.LAB_CONN_B}, {self.LAB_CONN_I}, {self.LAB_CONN_O}...")
+             print("(or the predictions may simply contain no B-conn label at all)")
+
+         return tokens, labels, spans
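+     # For instance, on the "Fidelity, for example, ..." snippet from the module docstring,
+     # tokens 3 ("for", Conn=B-conn) and 4 ("example", Conn=I-conn) produce the single
+     # 0-indexed span (2, 3) in `spans`, since `counter` counts non-MWE tokens from 0.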
+
+
+ class SegmentationEvaluation(Evaluation):
+     """
+     Specific evaluation class for EDU segmentation.
+     :parse conllu-style data
+     :eval on first-token identification
+     """
+     LAB_SEG_B = "Seg=B-seg"  # "BeginSeg=Yes"
+     LAB_SEG_I = "Seg=O"  # "_"
+
+     def __init__(self, name: str, gold_path: str, pred_path: str, str_i=False, no_b=False) -> None:
+         super().__init__(name)
+         """
+         :param gold_file: Gold shared task file
+         :param pred_file: File with predictions
+         :param string_input: If True, file names are replaced by strings with the file contents (for import inside other scripts)
+         """
+         self.mode = "edu"
+         self.seg_type = "EDUs"
+         self.g_path = gold_path
+         self.p_path = pred_path
+         self.opt_str_i = str_i
+         self.no_b = True if "conllu" in gold_path.split(os.sep)[-1] and no_b == True else False  # relevant only for conllu
+
+         self.fill_output('seg_type', self.seg_type)
+         self.fill_output("options", {"s": self.opt_str_i})
+
+     def compute_scores(self) -> None:
+         """
+         Get the lists of data to compare, then compute the metrics.
+         """
+         gold_tokens, gold_labels, gold_spans = self.parse_edu_data(self.g_path, self.opt_str_i, self.no_b)
+         pred_tokens, pred_labels, pred_spans = self.parse_edu_data(self.p_path, self.opt_str_i, self.no_b)
+
+         self.output['tok_count'] = len(gold_tokens)
+
+         self.check_tokens_number(gold_tokens, pred_tokens)
+         self.check_identical_tokens(gold_tokens, pred_tokens)
+         tp, fp, fn = self.compare_labels(gold_labels, pred_labels)
+         self.compute_PRF_metrics(tp, fp, fn)
+
+     def compare_labels(self, gold_labels: list, pred_labels: list) -> tuple[int, int, int]:
+         """
+         Count token-level true positives, false positives and false negatives over the two label lists.
+         """
+         true_positive = 0
+         false_positive = 0
+         false_negative = 0
+
+         for i, gold_label in enumerate(gold_labels):  # not verified
+             pred_label = pred_labels[i]
+             if gold_label == pred_label:
+                 if gold_label == "_":
+                     continue
+                 else:
+                     true_positive += 1
+             else:
+                 if pred_label == "_":
+                     false_negative += 1
+                 else:
+                     if gold_label == "_":
+                         false_positive += 1
+                     else:  # I-Conn/B-Conn mismatch?
+                         false_positive += 1
+
+         return true_positive, false_positive, false_negative
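+     # Toy example (illustrative): gold = ["Seg=B-seg", "_", "_"] vs. pred = ["_", "Seg=B-seg", "_"]
+     # gives tp=0, fn=1 (missed boundary at index 0) and fp=1 (spurious boundary at index 1),
+     # so precision = recall = f_score = 0 for this sequence.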
+
+     def parse_edu_data(self, path: str, str_i: bool, no_b: bool) -> tuple[list, list, list]:
+         """
+         LABEL = in the last column
+         """
+         data = self.get_data(path, str_i)
+         tokens = []
+         labels = []
+         spans = []
+         counter = 0
+         span_start = -1
+         span_end = -1
+         for line in data.split("\n"):  # this loop is the same as in version 1
+             if line.startswith("#") or line == "":
+                 continue
+             else:
+                 fields = line.split("\t")  # Token
+                 label = fields[-1]
+                 if "-" in fields[0] or "." in fields[0]:  # multi-word expression or ellipsis: no prediction should be there
+                     continue
+                 elif no_b == True and fields[0] == "1":
+                     label = "_"
+                 elif self.LAB_SEG_B in label:
+                     label = self.LAB_SEG_B
+                 else:
+                     label = "_"  # 🚩
+                     if span_start > -1:  # add the previous span
+                         if span_end == -1:
+                             span_end = span_start
+                         spans.append((span_start, span_end))
+                         span_start = -1
+                         span_end = -1
+
+                 tokens.append(fields[1])
+                 labels.append(label)
+                 counter += 1
+
+         if span_start > -1 and span_end > -1:  # add the last span
+             spans.append((span_start, span_end))
+
+         if not self.LAB_SEG_B in labels:
+             exit(f"Unrecognized labels. Expecting: {self.LAB_SEG_B}, {self.LAB_SEG_I}...")
+
+         return tokens, labels, spans
+
+
+ if __name__ == "__main__":
+
+     p = argparse.ArgumentParser()
+     p.add_argument("-g", "--goldfile", required=True, help="Shared task gold file in .tok, .conll or .rels format.")
+     p.add_argument("-p", "--predfile", required=True, help="Corresponding file with system predictions.")
+     p.add_argument("-t", "--task", required=True, choices=['S', 'C', 'R'], help="Choose one of the three options: S (EDU Segmentation), C (Connective Detection), R (Relation Classification)")
+     p.add_argument("-s", "--string_input", action="store_true", help="Whether inputs are file names or strings.")
+     p.add_argument("-nb", "--no_boundary_edu", default=False, action='store_true', help="Do not count EDUs that start at the beginning of a sentence.")
+     p.add_argument("-rt", "--rel_type", default=False, action='store_true', help="Evaluate relation types instead of labels.")
+
+     # help(Evaluation)
+     # help(SegmentationEvaluation)
+     # help(ConnectivesEvaluation)
+     # help(RelationsEvaluation)
+
+     opts = p.parse_args()
+
+     name = opts.goldfile.split(os.sep)[-1] if os.path.isfile(opts.goldfile) else f"string_input: {opts.goldfile[0:20]}..."
+
+     if opts.task == "R":
+         my_eval = RelationsEvaluation(name, opts.goldfile, opts.predfile, opts.string_input, opts.rel_type)
+     elif opts.task == "C":
+         my_eval = ConnectivesEvaluation(name, opts.goldfile, opts.predfile, opts.string_input)
+     elif opts.task == "S":
+         my_eval = SegmentationEvaluation(name, opts.goldfile, opts.predfile, opts.string_input, opts.no_boundary_edu)
+
+     my_eval.compute_scores()
+     my_eval.print_results()
disrpt_io.py ADDED
@@ -0,0 +1,846 @@
+ """
+ Classes to read/write disrpt-like files
+ + analysis of sentence splitting: "gold" sentences vs. stanza/spacy sentences
+ - ersatz
+
+ Disrpt is a discourse analysis campaign with (as of 2023):
+ - discourse segmentation information, in a conll-like format
+ - discourse connective information (also conll-like)
+ - discourse relation pairs, in a specific format
+
+ data are separated by corpus and language, with conventional names
+ of the form language.framework.corpusname
+ e.g. fra.sdrt.annodis
+
+ TODO:
+ - refactor how sentences are stored with a dictionary: "conllu" / "tok" / "split"
+   [ok] dictionary
+   ? refactor creation of corpus/documents to allow for updates (or load tok+conllu at once)
+ - [ok] the Italian luna corpus has different meta tags, with an extra level: newdoc_id/newturn_id/newutterance_id
+ - [ok] check behaviour on languages without pretrained models / what candidates?
+   - nl, pt, it -> en?
+   - thai -> multilingual
+ - test different candidate sets for splitting locations:
+   - [done] all -> too underspecified and too slow
+   - [ok] en on all but zho+thai
+   - [done] en instead of multilingual?
+     bad scores on zho
+ - [ok] fix bad characters: BOM, replacement char etc
+   special char for the apostrophe, cf.
+   data_clean/eng.dep.scidtb/eng.dep.scidtb_train.tok / newdoc_id = P16-1030 char problem for the possessive
+   ��antagonist��
+
+   Basque problem: "Osasun-zientzietako Ikertzaileen II ." token count ...
+   Iru�eko etc
+ - Turkish problem: tur.pdtb.tdb/tur.pdtb.tdb_train: BOM ? '\ufeff' -> 'Makale'
+   plus an extra blank token in train (785)?
+   774 olduğunu _ _ _ _ _ _ _ _
+   775 söylüyor _ _ _ _ _ _ _ _
+   776 : _ _ _ _ _ _ _ _
+   777 Türkiye _ _ _ _ _ _ _ _
+   778 demokrasi _ _ _ _ _ _ _ _
+   779 istiyor _ _ _ _ _ _ _ _
+   780 ÖDPGenel _ _ _ _ _ _ _ _
+   781 Başkanı _ _ _ _ _ _ _ _
+   782 Ufuk _ _ _ _ _ _ _ _
+   783 Uras'tan _ _ _ _ _ _ _ _
+   784 : _ _ _ _ _ _ _ _
+   785 _ _ _ _ _ _ _ _
+   786 Türkiye _ _ _ _ _ _ _ _
+   787 , _ _ _ _ _ _ _ _
+   788 AİHM'de _ _
+ - Chinese problems:
+   zh: ?是 is this "?" listed in ersatz?
+   ??hosto2
+   sctb 3.巴斯克
+ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+
+ - specific preprocessing:
+   annodis/gum: titles
+   gum/rrt: biblio / articles
+   scidtb ?
+ - different sentence splitters
+   - [ok] ersatz
+   - trankit
+   - [abandoned] stanza: FIXME: lots of errors made by stanza, e.g. splits within words (might be due to bad input tokenization)
+ - [done] write docs in disrpt format (after transformation, for instance)
+ - [done] eval of sentence beginnings (precision)
+ - [done] (done in the split_sentence script) eval / nb of conll sentences ~= sentence recall
+ - eval sentence length (max)
+ - [moot] clean the main script: arguments/argparse -> separate script
+ - [done] method for sentence splitting (for tok)
+ - [done] iterate over all docs in a corpus
+ - [done] choose the language automatically according to the corpus name
+ - ? method for sentence re-splitting for conllu? needs a way of indexing tokens for later re-eval? or the eval script does not care?
+
+
+ candidate sets for splitting:
+
+ - multilingual (default) is as described in the ersatz paper == [EOS punctuation][!number]
+ - en requires a space following punctuation
+ - all: a space between any two characters
+ - custom ones can be written using the determiner.Split() base class
+
+
+
+ """
+ import sys, os
+ import dataclasses
+ from itertools import chain
+ from collections import Counter
+ from copy import copy, deepcopy
+ from tqdm import tqdm
+ # import progressbar
+ # from ersatz import split, utils
+ # import trankit
+ # import stanza
+ # from stanza.pipeline.core import DownloadMethod
+
+ from transformers import pipeline
+
+ from wtpsplit import SaT
+
+
+ # needed to track the mistakes made in the preprocessing of the disrpt dataset, whose origin is unknown
+ BOM = '\ufeff'
+ REPL_CHAR = "\ufffd"  # �
+
+ test_doc_seg = """# newdoc id = geop_3_space
+ 1 La le DET _ Definite=Def|Gender=Fem|Number=Sing|PronType=Art 2 det _ BeginSeg=Yes
+ 2 Space space PROPN _ _ 0 root _ _
+ 3 Launcher Launcher PROPN _ _ 2 flat:name _ _
+ 4 Initiative initiative PROPN _ _ 2 flat:name _ _
+ 5 . . PUNCT _ _ 2 punct _ _
+
+ 1 Le le DET _ Definite=Def|Gender=Masc|Number=Sing|PronType=Art 2 det _ BeginSeg=Yes
+ 2 programme programme NOUN _ Gender=Masc|Number=Sing 10 nsubj _ _
+ 3 de de ADP _ _ 4 case _ _
+ 4 Space space PROPN _ _ 2 nmod _ _
+ 5 Launcher Launcher PROPN _ _ 4 flat:name _ _
+ 6 Initiative initiative PROPN _ _ 4 flat:name _ _
+ 7 ( ( PUNCT _ _ 8 punct _ BeginSeg=Yes
+ 8 SLI SLI PROPN _ _ 4 appos _ _
+ 9 ) ) PUNCT _ _ 8 punct _ _
+ 10 vise viser VERB _ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ BeginSeg=Yes
+ 11 à à ADP _ _ 12 mark _ _
+ 12 développer développer VERB _ VerbForm=Inf 10 ccomp _ _
+ 13 un un DET _ Definite=Ind|Gender=Masc|Number=Sing|PronType=Art 14 det _ _
+ 14 système système NOUN _ Gender=Masc|Number=Sing 12 obj _ _
+ 15 de de ADP _ _ 16 case _ _
+ 16 lanceur lanceur NOUN _ Gender=Masc|Number=Sing 14 nmod _ _
+ 17 réutilisable réutilisable ADJ _ Gender=Masc|Number=Sing 16 amod _ _
+ 18 entièrement entièrement ADV _ _ 19 advmod _ _
+ 19 inédit inédit ADJ _ Gender=Masc|Number=Sing 14 amod _ _
+ 20 . . PUNCT _ _ 10 punct _ _
+
+ # newdoc id = ling_fuchs_section2
+ 1 Théorie théorie PROPN _ _ 0 root _ BeginSeg=Yes
+ 2 psychomécanique psychomécanique ADJ _ Gender=Masc|Number=Sing 1 amod _ _
+ 3 et et CCONJ _ _ 4 cc _ _
+ 4 cognition cognition NOUN _ Gender=Fem|Number=Sing 1 conj _ _
+ 5 . . PUNCT _ _ 1 punct _ _
+ """
+
+ # Token is just a simple record type
+ Token = dataclasses.make_dataclass("Token", "id form lemma pos xpos morph head_id dep_type extra label".split(),
+                                    namespace={'__repr__': lambda self: self.form,
+                                               'format': lambda self: ("\t".join(map(str, dataclasses.astuple(self)))),
+                                               # ignored for now because we just get rid of MWEs when reading a disrpt file,
+                                               # but this could change in the future
+                                               # 'is_MWE': lambda self: type(self.id) is str and "-" in self.id,
+                                               }
+                                    )
+
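+ # Usage sketch (the field values are made up for illustration):
+ # tok = Token(1, "Education", "education", "NOUN", "NN", "Number=Sing", 0, "root", "_", "Seg=B-seg")
+ # repr(tok) -> "Education"; tok.format() -> the tab-separated ten-column conllu-style line.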
+
+ class Sentence:
+
+     def __init__(self, token_list, meta):
+         self.toks = token_list
+         self.meta = meta
+         # Added by Firmin or Chloé?
+         self.label_start = ["Seg=B-conn", "Seg=B-seg"]
+         self.label_end = ["Seg=I-conn", "Seg=O"]
+
+     def __iter__(self):
+         return iter(self.toks)
+
+     def __len__(self):
+         return len(self.toks)
+
+     def display(self, segment=False):
+         """if the segment option is set to True, print the sentence with EDU boundaries marked"""
+         if segment:
+             output = [f"{'|' if token.label=='Seg=B-seg' else ''}{token.form}" for token in self]
+             # output = [f"{'|' if token.label=='BeginSeg=Yes' else ''}{token.form}" for token in self]
+             return " ".join(output) + "|"
+         else:
+             return self.meta["text"]
+
+     def __contains__(self, word):
+         for token in self.toks:
+             if token.form == word:
+                 return True
+         return False
+
+     def __repr__(self):
+         return self.display()
+
+     def format(self):
+         meta = f"# sent_id = {self.meta['sent_id']}\n" + f"# text = {self.meta['text']}\n"
+         output = "\n".join([t.format() for t in self.toks])
+         return meta + output
+
+ # not necessary because of trankit auto-mode, but probably safer at some point
+ # why don't they use normalized language codes!?
+ TRANKIT_LANG_MAP = {
+     "de": "german",
+     "en": "english",
+     # to be tested
+     "gum": "english-gum",
+     "fr": "french",
+     "it": "italian",
+     "sp": "spanish",
+     "es": "spanish",
+     "eu": "basque",
+     "zh": "chinese",
+     "ru": "russian",
+     "tr": "turkish",
+     "pt": "portuguese",
+     "fa": "persian",
+     "nl": "dutch",
+ }
+
+ lg_map = {"sp": "es",
+           "po": "pt",
+           "tu": "tr"}
+
+
+ def get_language(lang, model):
+     # NB: ersatz_languages is expected to come from the (currently commented-out) ersatz import
+     lang = lang[:2]
+     if lang in lg_map:
+         lang = lg_map[lang]
+     if model == "ersatz":
+         if lang not in ersatz_languages:
+             lang = "default-multilingual"
+     if model == "trankit":
+         lang = TRANKIT_LANG_MAP.get(lang, "auto")
+     return lang
+
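+ # For instance, following the language.framework.corpus naming convention:
+ # get_language("spa", "trankit") -> "spanish" and get_language("tur", "trankit") -> "turkish"
+ # (both pass through lg_map first: "sp" -> "es", "tu" -> "tr").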
+ # This is taken from ersatz https://github.com/rewicks/ersatz/blob/master/ersatz/candidates.py
+ # sentence-ending punctuation
+ # U+0964 । Po DEVANAGARI DANDA
+ # U+061F ؟ Po ARABIC QUESTION MARK
+ # U+002E . Po FULL STOP
+ # U+3002 。 Po IDEOGRAPHIC FULL STOP
+ # U+0021 ! Po EXCLAMATION MARK
+ # U+06D4 ۔ Po ARABIC FULL STOP
+ # U+17D4 ។ Po KHMER SIGN KHAN
+ # U+003F ? Po QUESTION MARK
+ # U+2026 ... Po Ellipsis
+ # U+30FB
+ # U+002A *
+
+ # other acceptable punctuation
+ # U+3011 】 Pe RIGHT BLACK LENTICULAR BRACKET
+ # U+00BB » Pf RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+ # U+201D " Pf RIGHT DOUBLE QUOTATION MARK
+ # U+300F 』 Pe RIGHT WHITE CORNER BRACKET
+ # U+2018 ‘ Pi LEFT SINGLE QUOTATION MARK
+ # U+0022 " Po QUOTATION MARK
+ # U+300D 」 Pe RIGHT CORNER BRACKET
+ # U+201C " Pi LEFT DOUBLE QUOTATION MARK
+ # U+0027 ' Po APOSTROPHE
+ # U+2019 ’ Pf RIGHT SINGLE QUOTATION MARK
+ # U+0029 ) Pe RIGHT PARENTHESIS
+
+ ending_punc = {
+     '\u0964',
+     '\u061F',
+     '\u002E',
+     '\u3002',
+     '\u0021',
+     '\u06D4',
+     '\u17D4',
+     '\u003F',
+     '\uFF61',
+     '\uFF0E',
+     '\u2026',
+ }
+
+ closing_punc = {
+     '\u3011',
+     '\u00BB',
+     '\u201D',
+     '\u300F',
+     '\u2018',
+     '\u0022',
+     '\u300D',
+     '\u201C',
+     '\u0027',
+     '\u2019',
+     '\u0029'
+ }
+
+ list_set = {
+     '\u30fb',
+     '\uFF65',
+     '\u002a',  # asterisk
+     '\u002d',
+     '\u4e00'
+ }
+
+
+ class Document:
+     _hard_punct = {"default": {".", ";", "?", "!"} | ending_punc,
+                    "zh": {"。", "?"}
+                    }
+
+     def __init__(self, sentence_list, meta, src="conllu"):
+         self.sentences = {src: sentence_list}
+         self.meta = meta
+
+     def __repr__(self):
+         # ADDED (chloe): the if/else on file type
+         if "tok" in self.sentences:
+             return "\n".join(map(repr, self.sentences.get("conllu", self.sentences["tok"])))
+         elif "conllu" in self.sentences:
+             return "\n".join(map(repr, self.sentences["conllu"]))
+         else:
+             sys.exit("Unknown type of file: " + str(self.sentences.keys()))
+
+     def get_sentences(self, src="tok"):
+         return self.sentences[src]
+
+     def baseline_split(self, lang="default"):
+         """default split for languages where we have issues re-aligning tokens for various reasons
+
+         this just splits at every token that is a hard punctuation mark
+
+         FIXME: this is not complete
+         """
+         sentence_id = 1
+         sentences = []
+         current = []
+         orig_doc = self.sentences["tok"][0]
+         for token in orig_doc:
+             current.append(token)
+             if token.lemma in self._hard_punct.get(lang, self._hard_punct["default"]):
+                 meta = {"doc_id": orig_doc.meta["doc_id"],
+                         "sent_id": sentence_id,
+                         "text": " ".join([x.form for x in current])
+                         }
+                 sentences.append(Sentence(current, meta))
+                 current = []
+                 sentence_id += 1
+         if current != []:
+             meta = {"doc_id": orig_doc.meta["doc_id"],
+                     "sent_id": sentence_id,
+                     "text": " ".join([x.form for x in current])
+                     }
+             sentences.append(Sentence(current, meta))
+         return sentences
+
+
+     def cutoff_split(self, cutoff=120, lang="default"):
+         """
+         default split for corpora with little or no punctuation (transcriptions etc.)
+
+         just start a new sentence as soon as the current one holds more than `cutoff` tokens
+         """
+         sentence_id = 1
+         sentences = []
+         current = []
+         current_cpt = 1
+         orig_doc = self.sentences["tok"][0]
+         meta = {"doc_id": orig_doc.meta["doc_id"],
+                 "sent_id": sentence_id,
+                 }
+         for token in orig_doc:
+             token.id = current_cpt
+             current_cpt += 1
+             current.append(token)
+             # print(token, token.id)
+             if len(current) >= cutoff:
+                 # print(orig_doc.meta["doc_id"], token, current)
+                 meta = {"doc_id": orig_doc.meta["doc_id"],
+                         "sent_id": sentence_id,
+                         "text": " ".join([x.form for x in current])
+                         }
+                 sentences.append(Sentence(current, meta))
+                 current = []
+                 sentence_id += 1
+                 current_cpt = 1
+         if current != []:
+             meta = {"doc_id": orig_doc.meta["doc_id"],
+                     "sent_id": sentence_id,
+                     "text": " ".join([x.form for x in current])
+                     }
+             sentences.append(Sentence(current, meta))
+         return sentences
+
+     def ersatz_split(self, doc, lang='default-multilingual', candidates="en"):
+         result = split(model=lang,
+                        text=doc, output=None,
+                        batch_size=16,
+                        candidates=candidates,  # 'multilingual'
+                        cpu=True, columns=None, delimiter='\t')
+         return result
+
+     def stanza_split(self, orig_doc, lang):
+         nlp = stanza.Pipeline(lang=lang, processors='tokenize', download_method=DownloadMethod.REUSE_RESOURCES)
+         doc = nlp(orig_doc)
+         sentences = []
+         for s in doc.sentences:
+             sentences.append(" ".join([t.text for t in s.tokens]))
+         return sentences
+         # for i, sentence in enumerate(doc.sentences): for token in sentence.tokens / token.text
+
+     def trankit_split(self, orig_doc, lang, pipeline):
+         trk_sentences = pipeline.ssplit(orig_doc)
+         sentences = []
+         for s in trk_sentences["sentences"]:
+             sentences.append(s["text"])
+         return sentences
+
+     def sat_split(self, orig_doc, sat_model):
+         sat_sentences = sat_model.split(str(orig_doc))
+         sentences = []
+         for s in sat_sentences:
+             sentences.append(s)
+         return sentences
+
+     # TODO: debug option to turn warnings on/off
+     def _remap_tokens(self, split_sentences):
+         """remap tokens from sentence splitting to the original token information"""
+         # return split_sentences
+         # if this fails, there's been a bug: the token count differs between the original text and the total
+         # over the split sentences
+         # TODO: this is bound to happen, but the output should keep the original token count; how?
+         # TODO: REALIGN by detecting split tokens
+         orig_token_nb = sum(map(len, self.sentences["tok"]))
+         split_token_nb = len(list(chain(*[x.split() for x in split_sentences])))
+         try:
+             assert orig_token_nb == split_token_nb
+         except AssertionError:
+             print("WARNING wrong nb of tokens", orig_token_nb, "initially but", split_token_nb, "after split", file=sys.stderr)
+             # raise NotImplementedError
+         new_sentences = []
+         position = 0
+         skip_first_token = False
+         # will only work when splitting tok files, not when resplitting conllu
+         orig_doc = self.sentences["tok"][0]
+         for i, s in enumerate(split_sentences):
+             new_toks = s.split()
+             if skip_first_token:  # see below
+                 new_toks = new_toks[1:]
+             toks = orig_doc.toks[position:position + len(new_toks)]
+             meta = {"doc_id": orig_doc.meta["doc_id"],
+                     "sent_id": i + 1,
+                     "text": " ".join([x.form for x in toks])
+                     }
+             new_tok_position = position
+             shift = 0  # advance through the new tokens in case of erroneous splits
+             # actual nb of tokens to advance in the original document;
+             # the new tokens might include a mistakenly split token (tricky)
+             new_toks_length = len(new_toks)
+             for j in range(len(toks)):
+                 toks[j].id = j + 1
+                 new_j = j + shift
+                 try:
+                     assert toks[j].form == new_toks[new_j]
+                     # a split token has been detected, meaning it had a punctuation sign in it and makes a "fake" sentence;
+                     # it will be recovered in the current sentence, so it should be skipped in the next one
+                     skip_first_token = False
+                 except AssertionError:
+                     # TODO: check that the next token can be recovered
+                     # problem with differing Chinese punctuation codes?
+                     # print(f"WARNING === Token mismatch: {j,toks[j].form,new_toks[new_j]} \n {toks} \n {new_toks}", file=sys.stderr)
+                     # first case: within the same sentence (unlikely if a token was split by a punctuation sign)
+                     if j != len(toks) - 1:
+                         if len(toks[j].form) != len(new_toks[new_j]):  # if same length this is probably just an encoding problem (Chinese cases), so just ignore it
+                             # print(f"INFO: split token still within the sentence {j,toks[j].form,new_toks[new_j]} ... should not happen", file=sys.stderr)
+                             if toks[j].form == new_toks[new_j] + new_toks[new_j + 1]:
+                                 # print(f"INFO: split token correctly identified as {j,toks[j].form,new_toks[new_j]+new_toks[new_j+1]} ... advancing to the next one", file=sys.stderr)
+                                 shift = shift + 1
+                     # second case: the sentence ends here and the next token is in the next split sentence, which necessarily exists (?)
+                     else:
+                         if i + 1 < len(split_sentences):
+                             next_sentence = split_sentences[i + 1]
+                             next_token = split_sentences[i + 1].split()[0]
+                             skip_first_token = True
+                             if toks[j].form == new_toks[new_j] + next_token:
+                                 pass
+                                 # print(f"INFO: token can be recovered: ", end="", file=sys.stderr)
+                             else:
+                                 pass
+                                 # print(f"INFO: token can still not be recovered: ", end="", file=sys.stderr)
+                                 # print(toks[j].form, new_toks[new_j] + next_token, file=sys.stderr)
+                         else:
+                             pass
+                             # print(f"WARNING === unmatched token at end of document", new_toks[new_j], file=sys.stderr)
+                             # in theory this should not happen
+                             # does the next starting position have to be put back? no
+                             # position = position - 1
+             if len(toks) > 0:  # joining the first token might have generated an empty sentence
+                 new_sentences.append(Sentence(toks, meta))
+                 position = position + len(new_toks) - shift
+             else:
+                 skip_first_token = False
+         split_token_nb = sum([len(s.toks) for s in new_sentences])
+         # print("split_token_nb", split_token_nb)
+         try:
+             assert orig_token_nb == split_token_nb
+         except AssertionError:
+             print("ERROR wrong nb of tokens", orig_token_nb, "originally but", split_token_nb, "after split+remap", file=sys.stderr)
+             sys.exit()
+         return new_sentences
+
+
+     def sentence_split(self, model="ersatz", lang="default-multilingual", **kwargs):
+         """
+         call the sentence splitter on the actual document, read as one sentence from a tok file.
+         kwargs might contain an open "pipeline" (e.g. a trankit pipeline) to pass downstream for splitting sentences, so that it is not re-created for each paragraph
+         """
+         # if we split, the doc has been read as only one sentence
+         # we ignore multi-word expressions at reading time, but if this needs to change, it will impact this line:
+         doc = [x.form for x in self.sentences["tok"][0]]  # if not(x.is_MWE())]
+         doc = " ".join(doc)
+         if model == "ersatz":
+             # empirically seems better: "en" for all alphabet-based languages
+             # (candidates = candidate locations for sentence splitting,
+             # not to be confused with the language of the model)
+             candidates = "en" if lang not in {"zh", "th"} else "multilingual"
+             new_sentences = self.ersatz_split(doc, lang=lang, candidates=candidates)
+         elif model == "stanza":
+             new_sentences = self.stanza_split(doc, lang=lang)
+         elif model == "trankit":  # the initialized pipeline is passed on here
+             new_sentences = self.trankit_split(doc, lang=lang, **kwargs)
+         elif model == "baseline":
+             new_sentences = self.baseline_split(lang=lang)
+             self.sentences["split"] = new_sentences
+         elif model == "sat":
+             sat_model = kwargs.get("sat_model")
+             if sat_model is None:
+                 raise ValueError("sat_model must be provided for SAT sentence splitting.")
+             new_sentences = self.sat_split(doc, sat_model)
+             self.sentences["split"] = new_sentences
+         elif model == "cutoff":  # FIXME there should be a way to pass on the cutoff
+             new_sentences = self.cutoff_split(lang=lang)
+             self.sentences["split"] = new_sentences
+         else:
+             raise NotImplementedError
+         if model != "baseline" and model != "cutoff":
+             self.sentences["split"] = self._remap_tokens(new_sentences)
+         return self.sentences["split"]
+
+     def search_word(self, word):
+         return [s for s in self.sentences.get("split", []) if word in s]
+
+     def format(self, mode="split"):
+         """format the document in disrpt format
+         mode = original (sentences) or split (split_sentences)
+         """
+         target = self.sentences[mode]
+
+         output = "\n".join([s.format() + "\n" for s in target])
+         meta = f"# doc_id = {self.meta}\n"
+         return meta + output  # +"\n"
+
+
+ class Corpus:
+     META_types = {"newdoc_id": "doc_id",
+                   "newdoc id": "doc_id",
+                   "doc_id": "doc_id",
+                   "sent_id": "sent_id",
+                   "newturn_id": "newturn_id",
+                   "newutterance": "newutterance",
+                   "newutterance_id": "newutterance_id",
+                   "text": "text",
+                   }
+
+     def __init__(self, data=None):
+         """input to the constructor is a string
+         """
+         if data:
+             self.docs = self._parse(data.split("\n"))
+
+     @staticmethod
+     def _meta_parse(data_line):
+         """parse comments, as they contain meta information"""
+         if not ("=" in data_line):  # not a meta line
+             return "", ""
+         info, value = data_line[1:].strip().split("=", 1)
+         info = info.strip()
+         if info in Corpus.META_types:
+             meta_type = Corpus.META_types[info]
+         else:  # TODO should send a warning
+             # print("WARNING: bad meta line", info, value, data_line, file=sys.stderr) -> this just floods the output
+             meta_type, value = "", ""
+         return meta_type, value.strip()
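+     # For instance, _meta_parse("# newdoc_id = GUM_bio_byron") returns ("doc_id", "GUM_bio_byron"),
+     # while a comment with no "=" (not a meta line) returns ("", "").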
+
+     def search_doc(self, docid):
+         return [x for x in self.docs if x.meta == docid]
+
+     def _parse(self, data_lines, src="tok"):
+         """parse disrpt segmentation/connective files"""
+         curr_token_list = []
+         sentences = []
+         docs = []
+         s_idx = 0
+         doc_idx = 0
+         meta = {}
+
+         for data_line in data_lines:
+             data_line = data_line.strip()
+             if data_line:
+                 # comments always include some meta info of the form "metatype = value", minimally the document id
+                 if data_line.startswith("#"):
+                     meta_type, value = Corpus._meta_parse(data_line)
+                     # start of a new doc: save the previous one if it exists
+                     if meta_type == "doc_id":
+                         # print(doc_idx)
+                         if doc_idx > 0:
+                             # print(src)
+                             docs.append(Document(sentences, meta["doc_id"], src=src))
+                             sentences = []
+                             curr_token_list = []
+                             s_idx = 0
+                             meta = {}
+                         doc_idx += 1
+                     if meta_type != "":
+                         meta[meta_type] = value
+                 else:
+                     token, label = self.parse_token(meta, data_line)
+                     # print(token, label)
+                     # if this is a MWE, just ignore it. MWEs have ids combining the original token ids, e.g. "30-31"
+                     # TODO: refactor into parse_token + a boolean flag if ok
+                     if not ("-" in token[0]) and not ("." in token[0]):
+                         curr_token_list.append(Token(*token, label))
+             else:  # end of sentence
+                 meta["text"] = " ".join((x.form for x in curr_token_list))
+                 s_idx += 1
+                 # some corpora don't have ids for sentences
+                 if "sent_id" not in meta:
+                     meta["sent_id"] = s_idx
+                 sentences.append(Sentence(curr_token_list, meta))
+                 curr_token_list = []
+                 meta = {"doc_id": meta["doc_id"]}
+         if len(curr_token_list) > 0 or len(sentences) > 0:  # final sentence of the final document
+             meta["text"] = " ".join((x.form for x in curr_token_list))
+             sentences.append(Sentence(curr_token_list, meta))
+             # print("="*50)
+             # print(meta.keys())
+             # print(len(curr_token_list), len(sentences))
+             docs.append(Document(sentences, meta["doc_id"], src=src))
+             # print(src)
+         return docs
+
+     def format(self, file=None, mode="split"):
+         output = "\n\n".join([doc.format(mode=mode) for doc in self.docs])
+         if file:
+             os.makedirs(os.path.dirname(file), exist_ok=True)
+             with open(file, "w", encoding="utf-8") as f:
+                 f.write(output)
+         return output
+
+     def parse_token(self, meta, data_line):
+         *token, label = data_line.split("\t")
+         if len(token) == 8:
+             print("ERROR: missing label ", meta, token, file=sys.stderr)
+             token = token + [label]
+             label = '_'
+         # needed because of errors in the source of some corpora (Russian with the BOM kept as a token; also bad reading of some chars);
+         # to prevent token counts/tokenization from failing, they are replaced with '_'
+         # token[1] is the form of the token
+         if token[1] == BOM:
+             token[1] = "_"
+         # if token[1] == '200�000':
+         #     print("GOTCHA")
+         token[1] = token[1].replace(REPL_CHAR, "_")
+         label_set = set(label.split("|"))
+         label = (label_set & set(self.LABELS))
+         if label == set():
+             label = "_"
+         else:
+             label = label.pop()
+         return token, label
+
+     def from_file(self, filepath):
+         """
+         reads a conllu or tok file
+         conllu has sentences, tok does not
+
+         option to pass a string instead of a file path, mostly for testing
+
+         TODO: should be a static method
+         """
+         self.filepath = filepath
+         basename = os.path.basename(filepath)
+         src = basename.split(".")[-1]  # tok or conllu or split
+         # print("src = ", src)
+         with open(filepath, "r", encoding="utf8") as f:
+             data_lines = f.readlines()
+         self.docs = self._parse(data_lines, src=src)
+         # for sent in self.docs:
+         #     print(sent)
+
+     def from_string(self, text: str, src="conllu"):
+         """
+         Read directly from a string (useful for tests or dynamic generation).
+         src can be 'conllu', 'tok', or 'split' to indicate the format.
+         """
+         self.filepath = None
+         if isinstance(text, str):
+             data_lines = text.strip().splitlines()
+         else:
+             raise ValueError("from_string expects a string")
+         self.docs = self._parse(data_lines, src=src)
+
+     def format(self, mode="split", file=sys.stdout):
+         # NB: this definition shadows the format() defined above
+         if type(file) == str:
+             os.makedirs(os.path.dirname(file), exist_ok=True)
+             file = open(file, "w")
+         for d in self.docs:
+             print(d.format(mode=mode), file=file)
+
+     def align(self, filepath):
+         """load the conllu file corresponding to a tok file"""
+         pass
+
+     def sentence_split(self, model="ersatz", lang="default-multilingual", **kwargs):
+         """apply a sentence splitter to each document, assuming the corpus was read from
+         a .tok file
+
+         kwargs might contain an open "pipeline" (e.g. a trankit pipeline) to pass downstream for splitting sentences, so that it is not re-created for each paragraph
+         """
+         for doc in tqdm(self.docs):
+             doc.sentence_split(model=model, lang=lang, **kwargs)
+
+     def eval_sentences(self, mode="split"):
+         """eval sentence beginnings as segment beginnings
+         TODO rename -> precision
+
+         only .tok for now, but this could be used to eval the re-split of conllu;
+         more complex for pdtb: need to align tok and conllu
+         """
+         tp = 0
+         total_s = 0
+         labels = []
+         for doc in self.docs:
+             for s in doc.get_sentences(mode):
+                 if len(s.toks) == 0:
+                     print("WARNING empty sentence in ", s.meta, file=sys.stderr)
+                     break
+                 tp += (s.toks[0].label == "Seg=B-seg")
+                 # tp += (s.toks[0].label == "BeginSeg=Yes")
+                 total_s += 1
+                 labels.extend([x.label for x in s])
+         counts = Counter(labels)
+         # return tp, total_s, counts["BeginSeg=Yes"]
+         return tp, total_s, counts["Seg=B-seg"]
+
+
+ class SegmentCorpus(Corpus):
+     LABELS = ["Seg=O", "Seg=B-seg"]
+
+
+ class ConnectiveCorpus(Corpus):
+     LABELS = ['Conn=O', 'Conn=B-conn', 'Conn=I-conn']
+     id2label = {i: label for i, label in enumerate(LABELS)}
+     label2id = {v: k for k, v in id2label.items()}
+
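+ # Usage sketch, reusing the test_doc_seg string defined above (its labels use the older
+ # BeginSeg=Yes scheme, so under the current LABELS they fall back to "_"):
+ # corpus = SegmentCorpus()
+ # corpus.from_string(test_doc_seg, src="conllu")
+ # print(corpus.docs[0].sentences["conllu"][0].format())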
+
+ class RelationCorpus(Corpus):
+
+     def from_file(self, filepath):
+         pass
+
+
+ # ersatz existing language-specific models
+ # for ersatz 1.0.0:
+ # ['en', 'ar', 'cs', 'de', 'es', 'et', 'fi', 'fr', 'gu', 'hi', 'iu', 'ja',
+ #  'kk', 'km', 'lt', 'lv', 'pl', 'ps', 'ro', 'ru', 'ta', 'tr', 'zh', 'default-multilingual']
+ # missing disrpt languages / what candidates? nl, pt, it -> en? thai -> multilingual
+
+
+ if __name__ == "__main__":
+     # testing
+     import sys, os
+     from pathlib import PurePath
+     # from ersatz import split, utils
+     # ersatz existing language-specific models
+     # languages = utils.MODELS.keys()
+
+     if len(sys.argv) > 1:
+         test_path = sys.argv[1]
+     else:
+         test_path = "../jiant/tests/test_data/eng.pdtb.pdtb/eng.pdtb.pdtb_debug.tok"
+
+     basename = os.path.basename(test_path)
+     lang = basename.split(".")[0]
+     # lang = get_language(lang, "trankit")
+
+     path = PurePath(test_path)
+     # output_path = str(path.with_suffix(".split"))
+     output_path = "out"
+
+     if "pdtb" in test_path:
+         corpus = ConnectiveCorpus()
+     else:
+         corpus = SegmentCorpus()
+     corpus.from_file(test_path)
+
+     sat = SaT("sat-3l")  # 3l is better with French guillemets
+
+     # print(corpus.docs[0].sentences[11].display(segment=True))
+     print(sat.split("This is a test This is another test."))
+     doc1 = corpus.docs[0]
+     s0 = doc1.sentences["tok"][0]
+     print(doc1)
+     print(list(sat.split(str(doc1))))
+     # list(res)
+     # pipe = pipeline("token-classification", model="segment-any-text/sat-1l")
+     # res = doc1.sentence_split(model="sat")
+
+     # ------------------------------------------
+     # -- From the SaT docs:
+     # https://github.com/segment-any-text/wtpsplit?tab=readme-ov-file#usage
+     # sat = SaT("sat-3l")
+     # optionally run on GPU for better performance
+     # also supports TPUs via e.g. sat.to("xla:0"); in that case pass `pad_last_batch=True` to sat.split
+     # sat.half().to("cuda")
+
+     # print(sat.split("This is a test This is another test."))
+     # returns ["This is a test ", "This is another test."]
+
+     # # do this instead of calling sat.split on every text individually, for much better performance
+     # sat.split(["This is a test This is another test.", "And some more texts..."])
+     # # returns an iterator yielding lists of sentences for every text
+
+     # # use the '-sm' models for general sentence segmentation tasks
+     # sat_sm = SaT("sat-3l-sm")
+     # sat_sm.half().to("cuda")  # optional, see above
+     # sat_sm.split("this is a test this is another test")
+     # # returns ["this is a test ", "this is another test"]
+
+     # # use trained lora modules for strong adaptation to language & domain/style
+     # sat_adapted = SaT("sat-3l", style_or_domain="ud", language="en")
+     # sat_adapted.half().to("cuda")  # optional, see above
+     # sat_adapted.split("This is a test This is another test.")
+     # # returns ['This is a test ', 'This is another test']
+
+     # check that the number of tokens is preserved by sentence splitting
+     # # assert sum(map(len, doc1.sentences)) == len(list(chain(*[x.split() for x in res])))
+     # pipeline = trankit.Pipeline(lang, gpu=True)
+     # corpus.sentence_split(model="trankit", lang=lang, pipeline=pipeline)
+     corpus.sentence_split(model="sat", sat_model=sat)
+     tp, tot, n_seg = corpus.eval_sentences()
+     print(tp, tot, n_seg)
+     # print(corpus.docs[0].split_sentences[0].toks[0].format())
+     corpus.format(file=output_path)
eval.py ADDED
@@ -0,0 +1,760 @@
+ #!/usr/bin/python
+ # -*- coding: utf-8 -*-
+
+ import os, sys
+ import numpy as np
+ import transformers
+
+ import utils
+
+ import reading
+
+
+ SUBTOKEN_START = '##'
+
+ '''
+ TODOs:
+
+ - for now, if the dataset is cached, we can't use word ids, and the predictions
+   written are not based on the original eval file, thus not exactly the same number
+   of tokens (contractions are ignored) --> doesn't work in the disrpt eval script
+
+ Change in the newest version of transformers:
+ from seqeval.metrics import accuracy_score
+ from seqeval.metrics import classification_report
+ from seqeval.metrics import f1_score
+ '''
+
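+ # Minimal seqeval sketch for the imports mentioned above (toy BIO sequences, not project data):
+ # from seqeval.metrics import f1_score
+ # gold = [["B-seg", "O", "O", "B-seg"]]
+ # pred = [["B-seg", "O", "B-seg", "B-seg"]]
+ # f1_score(gold, pred)  # span-level F1 over the two sequences: 2 tp, 1 fp, 0 fn -> 0.8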
29
+
30
+ def simple_eval( dataset_eval, model_checkpoint, tokenizer, output_path,
31
+ config, trace=False ):
32
+ '''
33
+ Run the pre-trained model on the (dev) dataset to get predictions,
34
+ then write the predictions in an output file.
35
+
36
+ Parameters:
37
+ -----------
38
+ dataset_eval: DatasetDisc
39
+ The dataset to evaluate
40
+ model_checkpoint: str
41
+ path to the saved model
42
+ tokenizer: Tokenizer
43
+ tokenizer of the saved model (TODO: retrieve from model? or should be removed?)
44
+ output_path: str
45
+ path to the output directory where prediction files will be written
46
+ config: dict
47
+ Configuration dictionary (task, tokenizer settings, ...)
48
+ '''
49
+ # Retrieve predictions (list of list of 0 and 1)
50
+ print("\n-- PREDICT on:", dataset_eval.annotations_file )
51
+ model_checkpoint = os.path.normpath(model_checkpoint)
52
+ print("model_checkpoint", model_checkpoint)
53
+ preds_from_model, label_ids, metrics = retrieve_predictions( model_checkpoint,
54
+ dataset_eval, output_path, tokenizer, config )
55
+
56
+ print("preds_from_model.shape", preds_from_model.shape)
57
+ print("label_ids.shape", label_ids.shape)
58
+
59
+ # - Compute metrics
60
+ print("\n-- COMPUTE METRICS" )
61
+ compute_metrics = utils.prepare_compute_metrics( dataset_eval.LABEL_NAMES_BIO )
62
+ metrics=compute_metrics([preds_from_model, label_ids])
63
+ max_preds_from_model = np.argmax(preds_from_model, axis=-1)
64
+
65
+ # - Write predictions:
66
+ pred_file = os.path.join( output_path, dataset_eval.basename+'.preds' )
67
+ print("\n-- WRITE PREDS in:", pred_file )
68
+ pred_file_success = True
69
+
70
+ try:
71
+ try:
72
+ # * retrieving the original words: will fail if cache not emptied
73
+ print( "Write predictions based on words")
74
+ predictions = align_tokens_labels_from_wordids( max_preds_from_model, dataset_eval,
75
+ tokenizer)
76
+
77
+ write_pred_file( dataset_eval.annotations_file, pred_file, predictions, trace=trace )
78
+ except IndexError:
79
+ # if error, we print the predictions with tokens, trying to merge subtokens
80
+ # based on SUBTOKEN_START and labels at -100
81
+ print( "Write predictions based on model tokenisation" )
82
+ aligned_tokens, aligned_golds, aligned_preds = align_tokens_labels_from_subtokens(
83
+ max_preds_from_model, dataset_eval, tokenizer, pred_file, trace=trace )
84
+ write_pred_file_from_scratch( aligned_tokens, aligned_golds, aligned_preds,
85
+ pred_file, trace=trace )
86
+ except Exception as e:
87
+ print( "Problem when trying to write predictions in file", pred_file )
88
+ print( "Exception:", e )
89
+ print("we skip the prediction writing step")
90
+ pred_file_success=False
91
+
92
+ if pred_file_success:
93
+ print( "\n-- EVAL DISRPT script" )
94
+ clean_pred_path = pred_file.replace('.preds', '.cleaned.preds')
95
+ utils.clean_pred_file(pred_file, clean_pred_path)
96
+ utils.compute_metrics_dirspt( dataset_eval, clean_pred_path, task=config['task'] )
97
+ # except:
98
+ # print("Problem when trying to compute scores with DISRPT eval script")
99
+ return metrics
102
+
103
+
104
+ def write_pred_file(annotations_file, pred_file, predictions, trace=False):
105
+ '''
106
+ Write a file containing the predictions based on the original annotation file.
107
+ It takes each line in the original evaluation file and appends the prediction at
108
+ the end. Predictions and original tokens need to be perfectly aligned.
109
+
110
+ Parameters:
111
+ -----------
112
+ annotations_file: str | file path OR raw text
113
+ Path to the original evaluation file, or the text content itself
114
+ pred_file: str
115
+ Path to the output prediction file
116
+ predictions: list of str
117
+ Flat list of all predictions (DISRPT format) for all tokens in eval
118
+ '''
119
+ count_pred_B, count_gold_B = 0, 0
120
+ count_line_dash = 0
121
+ count_line_dot = 0
122
+
123
+
124
+
125
+ # --- Determine whether annotations_file is a path or raw text
126
+ if os.path.isfile(annotations_file):
127
+ with open(annotations_file, 'r', encoding='utf-8') as fin:
128
+ mylines = fin.readlines()
129
+ else:
130
+ # Treat it as a raw string
131
+ mylines = annotations_file.strip().splitlines()
132
+
133
+
134
+ os.makedirs(os.path.dirname(pred_file), exist_ok=True)
135
+ with open(pred_file, 'w', encoding='utf-8') as fout:
136
+ count = 0
137
+ if trace:
138
+ print("len(predictions)", len(predictions))
139
+ for l in mylines:
140
+ l = l.strip()
141
+ if l.startswith("#"): # Keep metadata
142
+ fout.write(l + '\n')
143
+ elif l == '' or l == '\n': # keep line break
144
+ fout.write('\n')
145
+ elif '-' in l.split('\t')[0]: # Keep lines for contractions but no label
146
+ if trace:
147
+ print("WARNING: line with - in token, no label will be added")
148
+ count_line_dash += 1
149
+ fout.write(l + '\t' + '_' + '\n')
150
+ # strange case in GUM
151
+ elif '.' in l.split('\t')[0]: # Keep lines no label
152
+ count_line_dot += 1
153
+ if trace:
154
+ print("WARNING: line with . in token, no label will be added")
155
+ fout.write(l + '\t' + '_' + '\n')
156
+ else:
157
+ if 'B' in predictions[count]:
158
+ count_pred_B += 1
159
+ if 'Seg=B-seg' in l or 'Conn=B-conn' in l:
160
+ count_gold_B += 1
161
+ fout.write(l + '\t' + predictions[count] + '\n')
162
+ count += 1
163
+
164
+ print("Count the number of predictions corresponding to a B", count_pred_B, "vs Gold B", count_gold_B)
165
+ print("Count the number of lines with - in token", count_line_dash)
166
+ print("Count the number of lines with . in token", count_line_dot)
167
+
168
+
169
+
170
+ def write_pred_file_from_scratch( aligned_tokens, aligned_golds, aligned_preds, pred_file, trace=False ):
171
+ '''
172
+ Write a prediction file based on an alignment between tokenisation and predictions.
173
+ Since we are not sure that we retrieved the exact alignment, the writing here is not based
174
+ on the original annotation file, but we use a similar format:
175
+ # Sent ID
176
+ tok_ID token gold_label pred_label
177
+
178
+ The use of the DISRPT script will show whether the alignment worked or not ...
179
+
180
+ Parameters:
181
+ ----------
182
+ aligned_XX: list of list of str
183
+ The tokens / preds / golds for each sentence
184
+ '''
185
+ count_pred_B, count_gold_B = 0, 0
186
+ with open( pred_file, 'w' ) as fout:
187
+ if trace:
188
+ print( 'len tokens', len(aligned_tokens))
189
+ print("len(predictions)", len(aligned_preds))
190
+ print( 'len(golds)', len(aligned_golds))
191
+ for s, tok_sent in enumerate( aligned_tokens ):
192
+ fout.write( "# sent_id = "+str(s)+"\n" )
193
+ for i, tok in enumerate( tok_sent ):
194
+ g = aligned_golds[s][i]
195
+ p = aligned_preds[s][i]
196
+ fout.write( '\t'.join([str(i), tok, g, p])+'\n' )
197
+ if 'B' in p:
198
+ count_pred_B += 1
199
+ if 'Seg=B-seg' in g or 'Conn=B-conn' in g:
200
+ count_gold_B += 1
201
+ fout.write( "\n" )
202
+ print("Count the number of predictions corresponding to a B", count_pred_B, "vs Gold B", count_gold_B)
203
+
204
+
205
+
206
+
207
+
208
+
209
+ def align_tokens_labels_from_wordids( preds_from_model, dataset_eval, tokenizer, trace=False ):
210
+ '''
211
+ Write predictions for segmentation or connective tasks in an output file.
212
+ The output is the same as the input gold file, with an additional column
213
+ corresponding to the predicted label.
214
+
215
+ Easiest way (?): use word_ids information to merge the words that have been split and
216
+ retrieve the original tokens from the input .tok / .conllu files and run
217
+ evaluation --> but not kept in the cached datasets
218
+
219
+ Parameters:
220
+ -----------
221
+ preds_from_model: list of int
222
+ The predicted labels (numeric ids)
223
+ dataset_eval: DatasetDisc
224
+ Dataset for evaluation
225
+ tokenizer: Tokenizer
226
+ Tokenizer of the model (used to decode inputs for tracing)
227
+
228
+ Return:
229
+ -------
230
+ predictions: list of String
231
+ The predicted labels (DISRPT format) for each original input word
232
+ '''
233
+
234
+ word_ids = dataset_eval.all_word_ids
235
+ id2label = dataset_eval.id2label
236
+ predictions = []
237
+ for i in range( preds_from_model.shape[0] ):
238
+ sent_input_ids = dataset_eval.tokenized_datasets['input_ids'][i]
239
+ tokens = dataset_eval.dataset['tokens'][i]
240
+ sent_tokens = tokenizer.decode(sent_input_ids[1:-1])
241
+ aligned_preds = _merge_tokens_preds_sent( word_ids[i], preds_from_model[i], tokens )
242
+ if trace:
243
+ print( '\n', i, sent_tokens )
244
+ print( sent_input_ids )
245
+ print( preds_from_model[i])
246
+ print( ' '.join( tokens ) )
247
+ print( "aligned_preds", aligned_preds )
248
+ for k, tok in enumerate( tokens ):
249
+ # Skip special tokens
250
+ if tok.startswith('[LANG=') or tok.startswith('[FRAME='):
251
+ if trace:
252
+ print(f"Skip special token: {tok}")
253
+ continue
254
+ label = aligned_preds[k]
255
+ predictions.append( id2label[label] )
256
+ return predictions
257
+
258
+ def _merge_tokens_preds_sent( word_ids, preds, tokens ):
259
+ '''
260
+ The tokenizer splits the tokens into subtokens, with labels added on subwords.
261
+ For evaluation, we need to merge the subtokens, and keep only the labels on
262
+ the plain tokens.
263
+ The function takes the whole input_ids and predictions for one sentence and
264
+ return the merged version.
265
+ We also get rid of tokens and associated labels for [CLS] and [SEP] and don't
266
+ keep predictions for padding tokens.
267
+ TODO: here inspired by the method used to split the labels, but we could cut the
268
+ two 'continue' branches (kept for debugging)
269
+
270
+ input_ids: list
271
+ list of ids of (sub)tokens as produced by the (BERT like) tokenizer
272
+ preds: list
273
+ the predictions of the model
274
+ '''
275
+ aligned_toks = []
276
+ count = 0
277
+ new_labels = []
278
+ current_word = None
279
+ for i, word_id in enumerate( word_ids ):
280
+ count += 1
281
+ if word_id != current_word:
282
+ # New word
283
+ current_word = word_id
284
+ if word_id is not None:
285
+ new_labels.append( preds[i] )
286
+ aligned_toks.append( tokens[word_id] )
287
+ elif word_id is None:
288
+ # Special token
289
+ continue
290
+ else:
291
+ # Same word as previous token
292
+ continue
293
+ if len(new_labels) != len(aligned_toks) or len(new_labels) != len(tokens):
294
+ print( "WARNING, something wrong, not the same nb of tokens and predictions")
295
+ print( len(new_labels), len(aligned_toks), len(tokens) )
296
+ return new_labels
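+ # Worked example (a sketch with made-up ids): for tokens ["Byron", "received"]
+ # split by the tokenizer into [CLS] By ##ron received [SEP], word_ids is
+ # [None, 0, 0, 1, None]; with preds [0, 2, 1, 0, 0], only the prediction on the
+ # first subtoken of each word is kept, so the function returns [2, 0].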
297
+
298
+
299
+ def map_labels_list( list_labels, id2label ):
300
+ return [id2label[l] for l in list_labels]
301
+
302
+ def align_tokens_labels_from_subtokens( preds_from_model, dataset_eval, tokenizer, pred_file, trace=False ):
303
+ '''
304
+ Align tokens and labels (merging subtokens, assigning the right label)
305
+ based on the specific characters for starting a subtoken (e.g. ## for BERT)
306
+ and label -100 assigned to contractions of MWE (e.g. it's).
307
+ But not completely sure that we get the exact alignment with original words here.
308
+ '''
309
+ aligned_tokens, aligned_golds, aligned_preds = [], [], []
310
+ id2label = dataset_eval.id2label
311
+ tokenized_dataset = dataset_eval.tokenized_datasets
312
+ # print("\ndataset_eval.tokenized_datasets", dataset_eval.tokenized_datasets)
313
+ # print("preds_from_model.shape", preds_from_model.shape)
314
+ # For each sentence
315
+ with open(pred_file, 'w') as fout:
316
+ # Iterate on sentences
317
+ for i in range( preds_from_model.shape[0] ):
318
+ # fout.write( "new_sent_"+str(i)+'\n' )
319
+ sent_input_ids = dataset_eval.tokenized_datasets['input_ids'][i]
320
+ sent_gold_labels = tokenized_dataset['labels'][i]
321
+ sent_pred_labels = preds_from_model[i]
322
+ aligned_t, aligned_g, aligned_p = _retrieve_tokens_from_sent( sent_input_ids, sent_pred_labels,
323
+ sent_gold_labels, tokenizer, trace=trace )
324
+ aligned_tokens.append(aligned_t)
325
+ aligned_golds.append( map_labels_list(aligned_g, id2label) )
326
+ aligned_preds.append( map_labels_list(aligned_p, id2label) )
327
+ return aligned_tokens, aligned_golds, aligned_preds
328
+
329
+ def _retrieve_tokens_from_sent( sent_input_ids, preds_from_model, sent_gold_labels, tokenizer, trace=False ):
330
+ # tokenized_dataset = dataset.tokenized_datasets
331
+ cur_token, cur_pred, cur_gold = None, None, None
332
+ tokens, golds, preds = [], [], []
333
+ if trace:
334
+ print( '\n\nlen(sent_input_ids', len(sent_input_ids))
335
+ print( 'len(preds_from_model)', len(preds_from_model) ) #with padding
336
+ print( 'len(sent_gold_labels)', len(sent_gold_labels))
337
+ # Ignore first and last token / labels
338
+ for j, input_id in enumerate( sent_input_ids[1:-1] ):
339
+ gold_label = sent_gold_labels[j+1]
340
+ pred_label = preds_from_model[j+1]
341
+ subtoken = tokenizer.decode( input_id )
342
+ if trace:
343
+ print( subtoken, gold_label, pred_label )
344
+ # Deal with tokens split into subtokens, keep label of the first subtoken
345
+ if subtoken.startswith( SUBTOKEN_START ) or gold_label == -100:
346
+ if cur_token is None:
347
+ print( "WARNING: first subtoken without a token, probably a contraction or MWE")
348
+ cur_token=""
349
+ cur_token += subtoken
350
+ else:
351
+ if cur_token is not None:
352
+ tokens.append( cur_token )
353
+ golds.append(cur_gold)
354
+ preds.append(cur_pred)
355
+ cur_token = subtoken
356
+ cur_pred = pred_label
357
+ cur_gold = gold_label
358
+ # add last one
359
+ tokens.append( cur_token )
360
+ golds.append(cur_gold)
361
+ preds.append(cur_pred)
362
+ if trace:
363
+ print( "\ntokens:", len(tokens), tokens )
364
+ print( "golds", len(golds), golds )
365
+ print( "preds", len(preds), preds )
366
+ for i, tok in enumerate(tokens):
367
+ print( tok, golds[i], preds[i])
368
+ return tokens, golds, preds
369
+
370
+ def retrieve_predictions(model_checkpoint, dataset_eval, output_path, tokenizer, config):
371
+ """
372
+ Load the trainer in eval mode and compute predictions
373
+ on dataset_eval (can be a HuggingFace dataset OR a list of sentences)
374
+ """
376
+
377
+ model_path = model_checkpoint
378
+ if os.path.isfile(model_checkpoint):
379
+ print(f"[INFO] Le chemin du modèle pointe vers un fichier, utilisation du dossier parent: {os.path.dirname(model_checkpoint)}")
380
+ model_path = os.path.dirname(model_checkpoint)
381
+
382
+ config_file = os.path.join(model_path, "config.json")
383
+ if not os.path.exists(config_file):
384
+ raise FileNotFoundError(f"Aucun fichier config.json trouvé dans {model_path}.")
385
+
386
+ # Load model
387
+ model = transformers.AutoModelForTokenClassification.from_pretrained(model_path)
388
+
389
+ # Collator
390
+ data_collator = transformers.DataCollatorForTokenClassification(
391
+ tokenizer=tokenizer,
392
+ padding=config["tok_config"]["padding"]
393
+ )
394
+ compute_metrics = utils.prepare_compute_metrics(
395
+ getattr(dataset_eval, "LABEL_NAMES_BIO", None) or []
396
+ )
397
+
398
+ # Mode eval
399
+ model.eval()
400
+
401
+ test_args = transformers.TrainingArguments(
402
+ output_dir=output_path,
403
+ do_train=False,
404
+ do_predict=True,
405
+ dataloader_drop_last=False,
406
+ report_to=config.get("report_to", "none"),
407
+ )
408
+
409
+ trainer = transformers.Trainer(
410
+ model=model,
411
+ args=test_args,
412
+ data_collator=data_collator,
413
+ compute_metrics=compute_metrics,
414
+ )
415
+
416
+ # If dataset_eval is just a list of sentences, build a Dataset from it
417
+ from datasets import Dataset
418
+
419
+ if isinstance(dataset_eval, list):
420
+ dataset_eval = Dataset.from_dict({"text": dataset_eval})
421
+ def tokenize(batch):
422
+ return tokenizer(batch["text"], truncation=True, padding=True)
423
+ dataset_eval = dataset_eval.map(tokenize, batched=True)
424
+
425
+
426
+ predictions, label_ids, metrics = trainer.predict(dataset_eval)
427
+ else:
428
+ # - Make predictions on eval dataset
429
+ predictions, label_ids, metrics = trainer.predict(dataset_eval.tokenized_datasets)
430
+ return predictions, label_ids, metrics
431
+
432
+
433
+
434
+ # --------------------------------------------------------------------------
435
+ # --------------------------------------------------------------------------
436
+ if __name__=="__main__":
437
+ import argparse, os
438
+ import shutil
439
+
440
+ path = os.path.join(os.path.expanduser("~"), ".cache", "huggingface", "datasets")
441
+
442
+ if os.path.exists(path):
443
+ shutil.rmtree(path)
444
+ print(f"Le dossier '{path}' a été supprimé.")
445
+ else:
446
+ print(f"Le dossier '{path}' n'existe pas.")
447
+
448
+ parser = argparse.ArgumentParser(
449
+ description='DISCUT: Discourse segmentation and connective detection'
450
+ )
451
+
452
+ # EVAL file
453
+ parser.add_argument("-t", "--test",
454
+ help="Eval file. Default: data_test/eng.sample.rstdt/eng.sample.rstdt_dev.conllu",
455
+ default="data_test/eng.sample.rstdt/eng.sample.rstdt_dev.conllu")
456
+
457
+ # PRE FINE-TUNED MODEL
458
+ parser.add_argument("-m", "--model",
459
+ help="path to the directory where is the Model file.",
460
+ default=None)
461
+
462
+ # OUTPUT DIRECTORY
463
+ parser.add_argument("-o", "--output",
464
+ help="Directory where models and pred will be saved. Default: /home/cbraud/experiments/expe_discut_2025/",
465
+ default="./data/temp_expe/")
466
+
467
+ # CONFIG FILE FROM THE FINE TUNED MODEL
468
+ parser.add_argument("-c", "--config",
469
+ help="Config file. Default: ./config_seg.json",
470
+ default="./config_seg.json")
471
+
472
+ # TRACE / VERBOSITY
473
+ parser.add_argument( '-v', '--trace',
474
+ action='store_true',
475
+ default=False,
476
+ help="Whether to print full messages. If used, it will override the value in config file.")
477
+
478
+ # TODO Add an option for choosing the tool to split the sentences
479
+
480
+ args = parser.parse_args()
481
+
482
+ eval_path = args.test
483
+ output_path = args.output
484
+ if not os.path.isdir( output_path ):
485
+ os.makedirs(output_path, exist_ok=True )
486
+ config_file = args.config
487
+ model = args.model
488
+ trace = args.trace
489
+
490
+ print( '\n-[DISCUT]--PROGRAM (eval) ARGUMENTS')
491
+ print( '| Mode', 'eval' )
492
+ if not model:
493
+ sys.exit( "Please provide a path to a model for eval mode.")
494
+ print( '| Test_path:', eval_path )
495
+ print( "| Output_path:", output_path )
496
+ if model:
497
+ print( "| Model:", model )
498
+ print( '| Config:', config_file )
499
+
500
+ print( '\n-[DISCUT]--CONFIG INFO')
501
+ config = utils.read_config( config_file )
502
+ utils.print_config( config )
503
+
504
+ print( "\n-[DISCUT]--READING DATASET")
505
+ ###
506
+ datasets = {}
507
+ datasets['dev'], tokenizer = reading.read_dataset( eval_path, output_path, config )
508
+
509
+ # model also in config[best_model_path]
510
+ metrics=simple_eval( datasets['dev'], model, tokenizer, output_path, config, trace=trace )
511
+
512
+
513
+
514
+
515
+
516
+
517
+
518
+
519
+
520
+
521
+
522
+
523
+
524
+
525
+
526
+
527
+
528
+
529
+
530
+
531
+ # # TODO clean, probably unused arguments here
532
+ # def simple_eval_deprecated( dataset_eval, model_checkpoint, tokenizer, output_path,
533
+ # config ):
534
+ # '''
535
+ # Run the pre-trained model on the (dev) dataset to get predictions,
536
+ # then write the predictions in an output file.
537
+
538
+ # Parameters:
539
+ # -----------
540
+ # datasets: dict of DatasetDisc
541
+ # The datasets read
542
+ # model_checkpoint: str
543
+ # path to the saved model
544
+ # tokenizer: Tokenizer
545
+ # tokenizer of the saved model (TODO: retrieve from model? or should be removed?)
546
+ # output_path: str
547
+ # path to the output directory where prediction files will be written
548
+ # data_collator: DataCollator
549
+ # (TODO: retrieve from model?)
550
+ # '''
551
+ # # tokenized_dataset = dataset_eval.tokenized_datasets
552
+ # dev_dataset = dataset_eval.dataset
553
+
554
+ # LABEL_NAMES = dataset_eval.LABEL_NAMES_BIO
555
+ # # TODO check if needed
556
+ # word_ids = dataset_eval.all_word_ids
557
+ # model = transformers.AutoModelForTokenClassification.from_pretrained(
558
+ # model_checkpoint
559
+ # )
560
+ # data_collator = transformers.DataCollatorForTokenClassification(
561
+ # tokenizer=tokenizer,
562
+ # padding=config["tok_config"]["padding"] )
563
+
564
+ # compute_metrics = utils.prepare_compute_metrics(LABEL_NAMES)
565
+
566
+ # # TODO is it useful to have both .eval() and test_args ?
567
+ # model.eval()
568
+
569
+ # test_args = transformers.TrainingArguments(
570
+ # output_dir = output_path,
571
+ # do_train = False,
572
+ # do_predict = True,
573
+ # #per_device_eval_batch_size = BATCH_SIZE,
574
+ # dataloader_drop_last = False
575
+ # )
576
+
577
+ # trainer = transformers.Trainer(
578
+ # model=model,
579
+ # args=test_args,
580
+ # data_collator=data_collator,
581
+ # compute_metrics=compute_metrics,
582
+ # )
583
+ # predictions, label_ids, metrics = trainer.predict(dataset_eval.tokenized_datasets)
584
+ # preds = np.argmax(predictions, axis=-1)
585
+
586
+ # compute_metrics([predictions, label_ids])
587
+
588
+ # # Try to write predictions: will fail if cache not emptied
589
+ # # because we need word_ids not saved in cache TODO check...
590
+ # pred_file = os.path.join( output_path, dataset_eval.basename+'.preds' )
591
+ # try:
592
+ # write_predictions_words( preds, dataset_eval.tokenized_datasets,
593
+ # tokenizer, pred_file, dataset_eval.id2label,
594
+ # word_ids, dev_dataset, dataset_eval )
595
+ # except IndexError:
596
+ # # if error, we print the predictions with tokens as is
597
+ # write_predictions_subtokens( preds, dataset_eval.tokenized_datasets,
598
+ # tokenizer, pred_file, dataset_eval.id2label )
599
+ # # Test DISRPT eval script
600
+ # print( "\nPerformance computed using disrpt eval script on", dataset_eval.annotations_file,
601
+ # pred_file )
602
+ # if config['task'] == 'seg':
603
+ # my_eval = disrpt_eval_2025.SegmentationEvaluation("temp_test_disrpt_eval_seg",
604
+ # dataset_eval.annotations_file,
605
+ # pred_file )
606
+ # elif config['task'] == 'conn':
607
+ # my_eval = disrpt_eval_2025.ConnectivesEvaluation("temp_test_disrpt_eval_conn",
608
+ # dataset_eval.annotations_file,
609
+ # pred_file )
610
+ # else:
611
+ # raise NotImplementedError
612
+ # my_eval.compute_scores()
613
+ # my_eval.print_results()
614
+
615
+
616
+ # # TODO: dd????
617
+ # # TODO : only for SEG/CONN --> to rename (and make a generic function)
618
+ # def write_predictions_words_deprecated( preds, dev, tokenizer, pred_file, id2label, word_ids,
619
+ # dev_dataset, dd, trace=False ):
620
+ # '''
621
+ # Write predictions for segmentation or connective tasks in an output files.
622
+ # The output is the same as the input gold file, with an additional column
623
+ # corresponding to the predicted label.
624
+
625
+ # ?? We need the word_ids information to merge the words that been split et
626
+ # retrieve the original tokens from the input .tok / .conllu files and run
627
+ # evaluation.
628
+
629
+ # Parameters:
630
+ # -----------
631
+ # preds: list of int
632
+ # The predicted labels (numeric ids)
633
+ # dev: Dataset
634
+ # tokenized_dev
635
+ # pred_file: str
636
+ # Path to the file where predictions will be written
637
+ # id2label: dict
638
+ # Convert from ids to labels
639
+ # word_ids: list?
640
+ # Word ids, None for task rel
641
+ # dev_dataset : Dataset
642
+ # Dataset for the dev set
643
+ # dd : str?
644
+ # dset
645
+ # '''
646
+ # predictions = []
647
+ # for i in range( preds.shape[0] ):
648
+ # sent_input_ids = dev['input_ids'][i]
649
+ # tokens = dev_dataset['tokens'][i]
650
+ # # sentence text
651
+ # sent_tokens = tokenizer.decode(sent_input_ids[1:-1])
652
+ # # list of decoded subtokens
653
+ # #sub_tokens = [tokenizer.decode(tok_id) for tok_id in sent_input_ids]
654
+ # # Merge subtokens and retrieve corresp. pred labels
655
+ # # i.e. we ignore: CLS, SEP, PAD and labels on ##subtoks
656
+ # aligned_preds = merge_tokens_preds_sent( word_ids[i], preds[i], tokens )
657
+ # if trace:
658
+ # print( '\n', i, sent_tokens )
659
+ # print( sent_input_ids )
660
+ # print( preds[i])
661
+ # print( ' '.join( tokens ) )
662
+ # print( "aligned_preds", aligned_preds )
663
+ # # sentence id, but TODO: retrieve doc ids
664
+ # #f.write( "# sent_id = "+str(i)+"\n" )
665
+ # # Write the original sentence text
666
+ # #f.write( "# text = "+sent_tokens+"\n" )
667
+ # # indices should start at 1
668
+ # for k, tok in enumerate( tokens ):
669
+ # label = aligned_preds[k]
670
+ # predictions.append( id2label[label] )
671
+ # #f.write( "\t".join( [str(k+1), tok, "_","_","_","_","_","_","_", id2label[label] ] )+"\n" )
672
+ # #f.write("\n")
673
+ # print("PREDICTIONS", predictions)
674
+ # count_pred_B, count_gold_B = 0, 0
675
+ # with open( dd.annotations_file, 'r' ) as fin:
676
+ # with open( pred_file, 'w' ) as fout:
677
+ # mylines = fin.readlines()
678
+ # count = 0
679
+ # if trace:
680
+ # print("len(predictions)", len(predictions))
681
+ # for l in mylines:
682
+ # l = l.strip()
683
+ # if l.startswith("#"):
684
+ # fout.write( l+'\n')
685
+ # elif l == '' or l == '\n':
686
+ # fout.write('\n')
687
+ # elif '-' in l.split('\t')[0]:
688
+ # fout.write( l+'\t'+'_'+'\n')
689
+ # else:
690
+ # if 'B' in predictions[count]:
691
+ # count_pred_B += 1
692
+ # if 'Seg=B-seg' in l or 'Conn=B-conn' in l:
693
+ # count_gold_B += 1
694
+ # fout.write( l+'\t'+predictions[count]+'\n')
695
+ # count += 1
696
+
697
+
698
+ # print("Count the number of predictions corresponding to a B", count_pred_B, "vs Gold B", count_gold_B)
699
+
700
+
701
+ # # TODO: dd????
702
+ # # TODO : only for SEG/CONN --> to rename (and make a generic function)
703
+ # def write_predictions_words( preds_from_model, dataset_eval, tokenizer, pred_file, trace=True ):
704
+ # '''
705
+ # Write predictions for segmentation or connective tasks in an output files.
706
+ # The output is the same as the input gold file, with an additional column
707
+ # corresponding to the predicted label.
708
+
709
+ # ?? We need the word_ids information to merge the words that been split et
710
+ # retrieve the original tokens from the input .tok / .conllu files and run
711
+ # evaluation.
712
+
713
+ # Parameters:
714
+ # -----------
715
+ # preds_from_model: list of int
716
+ # The predicted labels (numeric ids)
717
+ # dev: Dataset
718
+ # tokenized_dev
719
+ # pred_file: str
720
+ # Path to the file where predictions will be written
721
+ # id2label: dict
722
+ # Convert from ids to labels
723
+ # word_ids: list?
724
+ # Word ids, None for task rel
725
+ # dev_dataset : Dataset
726
+ # Dataset for the dev set
727
+ # dd : str?
728
+ # dset
729
+ # '''
730
+ # word_ids = dataset_eval.all_word_ids
731
+ # id2label = dataset_eval.id2label
732
+ # predictions = []
733
+ # for i in range( preds_from_model.shape[0] ):
734
+ # sent_input_ids = dataset_eval.tokenized_datasets['input_ids'][i]
735
+ # tokens = dataset_eval.dataset['tokens'][i]
736
+ # # sentence text
737
+ # sent_tokens = tokenizer.decode(sent_input_ids[1:-1])
738
+ # # list of decoded subtokens
739
+ # #sub_tokens = [tokenizer.decode(tok_id) for tok_id in sent_input_ids]
740
+ # # Merge subtokens and retrieve corresp. pred labels
741
+ # # i.e. we ignore: CLS, SEP, PAD and labels on ##subtoks
742
+ # aligned_preds = merge_tokens_preds_sent( word_ids[i], preds_from_model[i], tokens )
743
+ # if trace:
744
+ # print( '\n', i, sent_tokens )
745
+ # print( sent_input_ids )
746
+ # print( preds_from_model[i])
747
+ # print( ' '.join( tokens ) )
748
+ # print( "aligned_preds", aligned_preds )
749
+ # # sentence id, but TODO: retrieve doc ids
750
+ # #f.write( "# sent_id = "+str(i)+"\n" )
751
+ # # Write the original sentence text
752
+ # #f.write( "# text = "+sent_tokens+"\n" )
753
+ # # indices should start at 1
754
+ # for k, tok in enumerate( tokens ):
755
+ # label = aligned_preds[k]
756
+ # predictions.append( id2label[label] )
757
+ # #f.write( "\t".join( [str(k+1), tok, "_","_","_","_","_","_","_", id2label[label] ] )+"\n" )
758
+ # #f.write("\n")
759
+ # # print("PREDICTIONS", predictions)
760
+ # write_pred_file( dataset_eval.annotations_file, pred_file, predictions )
pipeline.py ADDED
@@ -0,0 +1,142 @@
1
+ from transformers import Pipeline, AutoModelForTokenClassification
2
+ import numpy as np
3
+ from eval import retrieve_predictions, align_tokens_labels_from_wordids
4
+ from reading import read_dataset
5
+ from utils import read_config
6
+
7
+
8
+
9
+ def write_sentences_to_format(sentences: list[str], filename: str):
10
+ """
11
+ Write sentences to a file, one word per line, in the format:
12
+ index<TAB>word<TAB>_<TAB>_<TAB>_<TAB>_<TAB>_<TAB>_<TAB>_<TAB>Seg=...
13
+ """
14
+
15
+ if not sentences:
16
+ return ""
17
+ if isinstance(sentences, str):
18
+ sentences=[sentences]
19
+ import sys
20
+ sys.stderr.write("Warning: only one sentence provided as a string instead of a list of sentences.\n")
21
+
22
+ full="# newdoc_id = GUM_academic_discrimination\n"
23
+ for sentence in sentences:
24
+ words = sentence.strip().split()
25
+ for i, word in enumerate(words, start=1):
26
+ # The first word, or a capitalized word, gets B-seg; otherwise O
27
+ seg_label = "B-seg" if i == 1 or word[0].isupper() else "O"
28
+ line = f"{i}\t{word}\t_\t_\t_\t_\t_\t_\t_\tSeg={seg_label}\n"
29
+ full+=line
30
+ if filename:
31
+ with open(filename, "w", encoding="utf-8") as f:
32
+ f.write(full)
33
+
34
+ return full
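+ # Example (illustrative): write_sentences_to_format(["this is a test"], None)
+ # returns:
+ # # newdoc_id = GUM_academic_discrimination
+ # 1	this	_	_	_	_	_	_	_	Seg=B-seg
+ # 2	is	_	_	_	_	_	_	_	Seg=O
+ # 3	a	_	_	_	_	_	_	_	Seg=O
+ # 4	test	_	_	_	_	_	_	_	Seg=O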
35
+
36
+
37
+ class DiscoursePipeline(Pipeline):
38
+ def __init__(self, model, tokenizer, config:dict, output_folder="./pipe_out",sat_model:str="sat-3l", **kwargs):
39
+ auto_model = AutoModelForTokenClassification.from_pretrained(model)
40
+ super().__init__(model=auto_model, tokenizer=tokenizer, **kwargs)
41
+ self.config = {"model_checkpoint": model, "sent_spliter":"sat","task":"seg","type":"tok","trace":False,"report_to":"none","sat_model":sat_model,"tok_config":{
42
+ "padding":"max_length",
43
+ "truncation":True,
44
+ "max_length": 512
45
+ }}
46
+ self.model = model  # keep the checkpoint path: retrieve_predictions reloads the model from it
47
+ self.output_folder = output_folder
48
+
49
+ def _sanitize_parameters(self, **kwargs):
50
+ # Allows passing optional parameters such as add_lang_token etc.
51
+ preprocess_params = {}
52
+ forward_params = {}
53
+ postprocess_params = {}
54
+ return preprocess_params, forward_params, postprocess_params
55
+
56
+ def preprocess(self, text:str):
57
+ self.original_text=text
58
+ formatted_text=write_sentences_to_format(text.split("\n"), filename=None)
59
+ dataset, _ = read_dataset(
60
+ formatted_text,
61
+ output_path=self.output_folder,
62
+ config=self.config,
63
+ add_lang_token=True,
64
+ add_frame_token=True,
65
+ )
66
+ return {"dataset": dataset}
67
+
68
+ def _forward(self, inputs):
69
+ dataset = inputs["dataset"]
70
+ preds_from_model, label_ids, _ = retrieve_predictions(
71
+ self.model, dataset, self.output_folder, self.tokenizer, self.config
72
+ )
73
+ return {"preds": preds_from_model, "labels": label_ids, "dataset": dataset}
74
+
75
+ def postprocess(self, outputs):
76
+ preds = np.argmax(outputs["preds"], axis=-1)
77
+ predictions = align_tokens_labels_from_wordids(preds, outputs["dataset"], self.tokenizer)
78
+ edus=text_to_edus(self.original_text, predictions)
79
+ return edus
80
+
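+ # Usage sketch (hedged: the checkpoint path is a placeholder; assumes a
+ # fine-tuned token-classification checkpoint and its matching tokenizer):
+ # from transformers import AutoTokenizer
+ # tok = AutoTokenizer.from_pretrained("path/to/checkpoint")
+ # pipe = DiscoursePipeline("path/to/checkpoint", tok, config={})
+ # edus = pipe("Byron received his early formal education at Aberdeen .")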
81
+ def get_plain_text_from_format(formatted_text:str) -> str:
82
+ """
83
+ Read conllu- or tok-formatted text and return the tokens as a plain string.
84
+ """
85
+ formatted_text=formatted_text.split("\n")
86
+ s=""
87
+ for line in formatted_text:
88
+ if not line.startswith("#"):
89
+ if len(line.split("\t"))>1:
90
+ s+=line.split("\t")[1]+" "
91
+ return s.strip()
92
+
93
+
94
+ def get_preds_from_format(formatted_text:str) -> str:
95
+ """
96
+ Read conllu- or tok-formatted text and return the prediction labels (last column) as a string.
97
+ """
98
+ formatted_text=formatted_text.split("\n")
99
+ s=""
100
+ for line in formatted_text:
101
+ if not line.startswith("#"):
102
+ if len(line.split("\t"))>1:
103
+ s+=line.split("\t")[-1]+" "
104
+ return s.strip()
105
+
106
+
107
+ def text_to_edus(text: str, labels: list[str]) -> list[str]:
108
+ """
109
+ Split a raw text into EDUs from a sequence of BIO labels.
110
+
111
+ Args:
112
+ text (str): The raw text (a sequence of space-separated words).
113
+ labels (list[str]): The sequence of BIO labels (B, I, O),
114
+ the same length as the number of tokens in the text.
115
+
116
+ Returns:
117
+ list[str]: The list of EDUs (each EDU is a substring of the text).
118
+ """
119
+ words = text.strip().split()
120
+ if len(words) != len(labels):
121
+ raise ValueError(f"Longueur mismatch: {len(words)} mots vs {len(labels)} labels")
122
+
123
+ edus = []
124
+ current_edu = []
125
+
126
+ for word, label in zip(words, labels):
127
+ if label == "Conn=O" or label == "Seg=O":
128
+ current_edu.append(word)
129
+
130
+ elif label == "Conn=B-conn" or label == "Seg=B-seg":
131
+ # Close the current EDU if one is open
132
+ if current_edu:
133
+
134
+ edus.append(" ".join(current_edu))
135
+ current_edu = []
136
+ current_edu.append(word)
137
+
138
+ # If an EDU is still open, close it
139
+ if current_edu:
140
+ edus.append(" ".join(current_edu))
141
+
142
+ return edus
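+ # Example (doctest-style sketch):
+ # >>> text_to_edus("Byron received his education while he was young",
+ # ...              ["Seg=B-seg", "Seg=O", "Seg=O", "Seg=O",
+ # ...               "Seg=B-seg", "Seg=O", "Seg=O", "Seg=O"])
+ # ['Byron received his education', 'while he was young']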
reading.py ADDED
@@ -0,0 +1,512 @@
1
+ #!/usr/bin/python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os, sys
5
+
6
+ import datasets
7
+ import transformers
8
+
9
+ import disrpt_io
10
+ import utils
11
+
12
+ # TODO: remove once the issue of loading languages is dealt with
13
+ ##from ersatz import utils
14
+ ##LANGUAGES = utils.MODELS.keys()
15
+ LANGUAGES = []
16
+
17
+ def read_dataset( input_path, output_path, config, add_lang_token=True,add_frame_token=True,lang_token="",frame_token="" ):
18
+ '''
19
+ - Read the file in input_path
20
+ - Return a Dataset corresponding to the file
21
+
22
+ Parameters
23
+ ----------
24
+ input_path : str
25
+ Path to the dataset
26
+ output_path : str
27
+ Path to an output directory that can be used to write new split files
28
+ config : dict
29
+ Configuration dictionary (including the model checkpoint)
30
+ add_lang_token : bool
31
+ If True, add a special language token at the beginning of each sequence
32
+
33
+ Returns
34
+ -------
35
+ Dataset
36
+ Contains the Dataset built from train_path and dev_path in train mode,
37
+ only the dev / test paths otherwise
38
+ Tokenizer
39
+ The tokenizer used for the dataset
40
+ '''
41
+ model_checkpoint = config["model_checkpoint"]
42
+ # -- Init tokenizer
43
+ tokenizer = transformers.AutoTokenizer.from_pretrained( model_checkpoint )
44
+ # -- Read and tokenize
45
+ dataset = DatasetSeq( input_path, output_path, config, tokenizer, add_lang_token=add_lang_token,add_frame_token=add_frame_token,lang_token=lang_token,frame_token=frame_token )
46
+ dataset.read_and_tokenize()
47
+ # TODO move in class? or do elsewhere
48
+ LABEL_NAMES_BIO = retrieve_bio_labels( dataset ) # TODO should do it only once for all
49
+ dataset.set_label_names_bio(LABEL_NAMES_BIO)
50
+ return dataset, tokenizer
51
+
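+ # Usage sketch (illustrative paths; the config dict must provide at least
+ # "model_checkpoint", "type", "task", "trace", "tok_config" and "sent_spliter"):
+ # config = utils.read_config("./config_seg.json")
+ # dataset, tokenizer = read_dataset(
+ #     "data_test/eng.sample.rstdt/eng.sample.rstdt_dev.conllu",
+ #     "./data/temp_expe/", config)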
52
+ # --------------------------------------------------------------------------
53
+ # DatasetDict
54
+
55
+ class DatasetDisc( ):
56
+ def __init__(self, annotations_file, output_path, config, tokenizer, dset=None ):
57
+ """
58
+ Here we save the location of our input file,
59
+ load the data, i.e. retrieve the list of texts and associated labels,
60
+ build the vocabulary if none is given,
61
+ and define the pipelines used to prepare the data
62
+ """
63
+ self.annotations_file = annotations_file
64
+ if isinstance(annotations_file, str) and not os.path.isfile(annotations_file):
65
+ print("this is a string dataset")
66
+ self.basename = "input"
67
+ else:
68
+ self.basename = os.path.basename( self.annotations_file )
69
+ self.dset = self.basename.split(".")[2].split('_')[1]
70
+ self.corpus_name = self.basename.split('_')[0]
71
+
72
+ self.tokenizer = tokenizer
73
+ self.config = config
74
+ # If a sentence splitter is used, the files with the new segmentation will be saved here
75
+ self.output_path = output_path
76
+
77
+ # Retrieve info from config: TODO check against info from dir name?
78
+ self.mode = config["type"]
79
+ self.task = config["task"]
80
+ self.trace = config["trace"]
81
+ self.tok_config = config["tok_config"]
82
+ self.sent_spliter = config["sent_spliter"]
83
+
84
+ # Additional fields
85
+ self.id2label, self.label2id = {}, {}
86
+
87
+ # -- Use disrpt_io to read the file and retrieve annotated data
88
+ self.corpus = init_corpus( self.task ) # initialize a Corpus instance, depending on the task
89
+
90
+
91
+
92
+
93
+
94
+ def read_and_tokenize( self ):
95
+ print("\n-- READ FROM FILE:", self.annotations_file )
96
+ try:
97
+ self.read_annotations( )
98
+ except Exception as err:
99
+ print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
100
+ raise
101
+ # print( "Problem when reading", self.annotations_file )
102
+
103
+ #print("\n-- SET LABELS")
104
+ self.set_labels( )
105
+ print( "self.label2id", self.label2id )
106
+
107
+ #print("\n-- TOKENIZE DATASET")
108
+ self.tokenize_dataset()
109
+ if self.trace:
110
+ if self.dset:
111
+ print( "\n-- FINISHED READING", self.dset, "PRINTING TRACE --")
112
+ self.print_trace()
113
+
114
+ def tokenize_datasets( self ):
115
+ # Specific to subclasses
116
+ raise NotImplementedError
117
+
118
+ def set_labels( self ):
119
+ # Specific to subclasses
120
+ raise NotImplementedError
121
+
122
+ # outside the class?
123
+ # TODO use **kwags instead?
124
+ def read_annotations( self ):
125
+ '''
126
+ Generate a Corpus object based on the input_file.
127
+ Since .tok files are not segmented into sentences, a sentence splitter
128
+ is used (here, SaT from wtpsplit)
129
+ '''
130
+ if os.path.isfile(self.annotations_file):
131
+ self.corpus.from_file(self.annotations_file)
132
+ lang = os.path.basename(self.annotations_file).split(".")[0]
133
+ frame = os.path.basename(self.annotations_file).split(".")[1]
134
+ base = os.path.basename(self.annotations_file)
135
+ else:
136
+ # assume it is raw text already in the expected format
137
+ src = self.mode if self.mode in ["tok", "conllu", "split"] else "conllu"
138
+ self.corpus.from_string(self.annotations_file,src=src)
139
+ lang = self.lang_token
140
+ frame = self.frame_token
141
+ base = "input.text"
142
+
143
+
144
+
145
+ #print(f"[DEBUG] lang? {lang}")
146
+ for doc in self.corpus.docs:
147
+ doc.lang = lang
148
+ doc.frame = frame
149
+ # print(corpus)
150
+ # Split corpus into sentences using Ersatz
151
+ if self.mode == 'tok':
152
+ kwargs={}
153
+ from wtpsplit import SaT
154
+ sat_version="sat-3l"
155
+ if "sat_model" in self.config:
156
+ sat_version=self.config["sat_model"]
157
+
158
+ sat_model = SaT(sat_version)
159
+ kwargs["sat_model"] = sat_model
160
+ self.corpus.sentence_split(model = self.sent_spliter, lang="default-multilingual",sat_model=sat_model)
161
+ # Writing files with the split sentences
162
+ parts = base.split(".")[:-1]
163
+ split_filename = ".".join(parts) + ".split"
164
+ split_file = os.path.join(self.output_path, split_filename)
165
+ self.corpus.format(file=split_file)
166
+ # no need for sentence splitting if mode = conllu or split, no need to write files
167
+
168
+ def print_trace( self ):
169
+ print( "\n| Annotation_file: ", self.annotations_file )
170
+ print( '| Output_path:', self.output_path )
171
+ print( '| Nb_of_instances:', len(self.dataset), "(", len(self.dataset['labels']), ")" )
172
+ # "(", len(self.dataset['tokens']), len(self.dataset['labels']), ")" )
173
+
174
+ def print_stats( self ):
175
+ print( "| Annotation_file: ", self.annotations_file )
176
+ if self.dset: print( "| Data_split: ", self.dset )
177
+ print( "| Task: ", self.task )
178
+ print( "| Lang: ", self.lang )
179
+ print( "| Mode: ", self.mode )
180
+ print( "| Label_names: ", self.LABEL_NAMES)
181
+ #print( "---Number_of_documents", len( self.corpus.docs ) )
182
+ print( "| Number_of_instances: ", len(self.dataset) )
183
+ # TODO : add number of docs: not computed for .rels for now
184
+
185
+ # -------------------------------------------------------------------------------------------------
186
+ class DatasetSeq(DatasetDisc):
187
+ def __init__( self, annotations_file, output_path, config, tokenizer, add_lang_token=True, add_frame_token=True,
188
+ dset=None,lang_token="",frame_token="" ):
189
+ """
190
+ Class for tasks corresponding to a sequence labeling problem
191
+ (seg, conn).
192
+ Here we save the location of our input file,
193
+ load the data, i.e. retrieve the list of texts and associated
194
+ labels,
195
+ build the vocabulary if none is given,
196
+ and define the pipelines used to prepare the data """
197
+ DatasetDisc.__init__( self, annotations_file, output_path, config,
198
+ tokenizer )
199
+ self.add_lang_token = add_lang_token
200
+ self.add_frame_token=add_frame_token
201
+ self.lang_token = lang_token
202
+ self.frame_token=frame_token
203
+
204
+ if self.mode == 'tok' and self.output_path == None:
205
+ self.output_path = os.path.dirname( self.annotations_file )
206
+ self.output_path = os.path.join( self.output_path,
207
+ self.basename.replace("."+self.mode, ".split") )
208
+
209
+ self.sent_spliter = None
210
+ if "sent_spliter" in self.config:
211
+ self.sent_spliter = self.config["sent_spliter"] #only for seg
212
+
213
+ self.LABEL_NAMES_BIO = None
214
+ # # TODO not used, really a good idea?
215
+ # self.data_collator = transformers.DataCollatorForTokenClassification(tokenizer=self.tokenizer,
216
+ # padding=self.tok_config["padding"] )
217
+
218
+ def tokenize_dataset( self ):
219
+ # -- Create a HuggingFace Dataset object
220
+ if self.trace:
221
+ print(f"\n-- Creating dataset from generator (add_lang_token={self.add_lang_token})")
222
+ self.dataset = datasets.Dataset.from_generator(
223
+ gen,
224
+ gen_kwargs={"corpus": self.corpus, "label2id": self.label2id, "mode": self.mode, "add_lang_token": self.add_lang_token,"add_frame_token":self.add_frame_token},
225
+ )
226
+ if self.trace:
227
+ print( self.dataset[0])
228
+ # Keep track of the alignment between words and subtokens, even if not ##
229
+ # BERT*-style tokenizers also split on punctuation, even when given a list of words
230
+ self.all_word_ids = []
231
+ # Align labels according to tokenized subwords
232
+ if self.trace:
233
+ print( "\n-- Mapping dataset labels and subwords ")
234
+ self.tokenized_datasets = self.dataset.map(
235
+ tokenize_and_align_labels,
236
+ fn_kwargs = {"tokenizer":self.tokenizer,
237
+ "id2label":self.id2label,
238
+ "label2id":self.label2id,
239
+ "all_word_ids":self.all_word_ids,
240
+ "config":self.config},
241
+ batched=True,
242
+ remove_columns=self.dataset.column_names,
243
+ )
244
+ if self.trace:
245
+ print( self.tokenized_datasets[0])
246
+
247
+
248
+ def set_labels(self):
249
+ self.LABEL_NAMES = self.corpus.LABELS
250
+ self.id2label = {i: label for i, label in enumerate( self.LABEL_NAMES )}
251
+ self.label2id = {v: k for k,v in self.id2label.items()}
252
+
253
+ def set_label_names_bio( self, LABEL_NAMES_BIO ):
254
+ self.LABEL_NAMES_BIO = LABEL_NAMES_BIO
255
+
256
+
257
+ def print_trace( self ):
258
+ super().print_trace()
259
+ print( '\n--First sentence: original tokens and labels.\n')
260
+ print( self.dataset[0]['tokens'] )
261
+ print( self.dataset[0]['labels'] )
262
+ print( "\n---First sentence: tokenized version:\n")
263
+ print( self.tokenized_datasets[0] )
264
+ # print( '\nSource word ids:', len(self.all_word_ids) )
265
+
266
+ # # TODO prepaper a compute_stats before printing, to allow partial printing without trace mode
267
+ # def print_stats( self ):
268
+ # super().print_stats()
269
+ # print( "| Number_of_documents", len( self.corpus.docs ) )
270
+
271
+
272
+ def init_corpus( task ):
273
+ if task.strip().lower() == 'conn':
274
+ return disrpt_io.ConnectiveCorpus()
275
+ elif task == 'seg':
276
+ return disrpt_io.SegmentCorpus()
277
+ else:
278
+ raise NotImplementedError
279
+
280
+ def gen( corpus, label2id, mode, add_lang_token=True,add_frame_token=True ):
281
+ # Add special language / framework tokens at the beginning of each sequence
282
+ source = "split"
283
+ if mode == 'conllu':
284
+ source = "conllu"
285
+ for doc in corpus.docs:
286
+ lang = getattr(doc, 'lang', 'xx')
287
+ lang_token = f"[LANG={lang}]"
288
+
289
+ frame = getattr(doc, 'frame', 'xx')
290
+ frame_token = f"[FRAME={frame}]"
291
+ sent_list = doc.sentences[source] if source in doc.sentences else doc.sentences
292
+ for sentence in sent_list:
293
+ labels = []
294
+ tokens = []
295
+ if add_lang_token:
296
+ tokens.append(lang_token)
297
+ labels.append(-100)
298
+ if add_frame_token:
299
+ tokens.append(frame_token)
300
+ labels.append(-100)
301
+ #print(f"[DEBUG] Ajout du token frame {frame_token} pour la phrase: {' '.join([t.form for t in sentence.toks])}")
302
+ for t in sentence.toks:
303
+ tokens.append(t.form)
304
+ if t.label == '_':
305
+ if 'O' in label2id:
306
+ labels.append(label2id['O'])
307
+ else:
308
+ labels.append(list(label2id.values())[0])
309
+ else:
310
+ labels.append(label2id[t.label])
311
+ yield {
312
+ "tokens": tokens,
313
+ "labels": labels
314
+ }
315
+
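+ # Each yielded item is one sentence, e.g. (a sketch with made-up label ids):
+ # {"tokens": ["[LANG=eng]", "[FRAME=rstdt]", "Byron", "received"],
+ #  "labels": [-100, -100, 1, 0]}
+ # The [LANG=...] / [FRAME=...] meta-tokens carry -100 so they are ignored by
+ # the loss and stripped again before evaluation.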
316
+
317
+ def get_tokenizer( model_checkpoint ):
318
+ return transformers.AutoTokenizer.from_pretrained(model_checkpoint)
319
+
320
+ def tokenize_and_align_labels( dataset, tokenizer, id2label, label2id, all_word_ids, config ):
321
+ '''
322
+ (Done in batches)
323
+ To preprocess our whole dataset, we need to tokenize all the inputs and
324
+ apply align_labels_with_tokens() on all the labels.
325
+ (with HF, we can use Dataset.map to process batches)
326
+ The word_ids() function needs to get the index of the example we want
327
+ the word IDs of when the inputs to the tokenizer are lists of texts
328
+ (or in our case, list of lists of words), so we add that too:
329
+ "tok_config"
330
+ '''
331
+ tokenized_inputs = tokenizer(
332
+ dataset["tokens"],
333
+ truncation=config["tok_config"]['truncation'],
334
+ padding=config["tok_config"]['padding'],
335
+ max_length=config["tok_config"]['max_length'],
336
+ is_split_into_words=True
337
+ )
338
+ # tokenized_inputs = tokenizer(
339
+ # dataset["tokens"], truncation=True, padding=True, is_split_into_words=True
340
+ # )
341
+ all_labels = dataset["labels"]
342
+ new_labels = []
343
+ #print( "tokenized_inputs.word_ids()", tokenized_inputs.word_ids() )
344
+ #print( [tokenizer.decode(tok) for tok in tokenized_inputs['input_ids']])
345
+ ##with progressbar.ProgressBar(max_value=len(all_labels)) as bar:
346
+ ##for i in tqdm(range(len(all_labels))):
347
+ for i, labels in enumerate(all_labels):
348
+ word_ids = tokenized_inputs.word_ids(i)
349
+ new_labels.append(align_labels_with_tokens(labels, word_ids, id2label, label2id, tokenizer, tokenized_inputs ))
350
+ # Used to fill the all_word_ids field of the Dataset object, but should probably be done somewhere else
351
+ all_word_ids.append( word_ids )
352
+ ##bar.update(i)
353
+ tokenized_inputs["labels"] = new_labels
354
+ return tokenized_inputs
355
+
356
+ def align_labels_with_tokens(labels, word_ids, id2label, label2id, tokenizer, tokenized_inputs):
357
+ '''
358
+ BERT like tokenization will create new tokens, we need to align labels.
359
+ Special tokens get a label of -100. This is because by default -100 is an
360
+ index that is ignored in the loss function we will use (cross entropy).
361
+ Then, each token gets the same label as the token that started the word
362
+ it’s inside, since they are part of the same entity. For tokens inside a
363
+ word but not at the beginning, we replace the B- with I- (since the token
364
+ does not begin the entity). [Taken from HF website course on NER]
365
+ '''
366
+ count = 0
367
+ new_labels = []
368
+ current_word = None
369
+ for word_id in word_ids:
370
+ count += 1
371
+ if word_id == 0: # or maybe 1
372
+ #TODO
373
+ #add lang token -100
374
+ pass
375
+ if word_id != current_word:
376
+ # Start of a new word!
377
+ current_word = word_id
378
+ label = -100 if word_id is None else labels[word_id]
379
+ new_labels.append(label)
380
+ elif word_id is None:
381
+ # Special token
382
+ new_labels.append(-100)
383
+ else:
384
+ # Same word as previous token
385
+ label = labels[word_id]
386
+ # Only look for 'B-' if label != -100
387
+ if label != -100 and 'B-' in id2label[label]:
388
+ label = -100
389
+ new_labels.append(label)
390
+ return new_labels
391
+
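+ # Worked example (a sketch; ids are illustrative, with 1 = a 'B-...' tag and
+ # 0 = 'O'): for word labels [1, 0] on ["washing", "machines"] and word_ids
+ # [None, 0, 0, 1, None] ([CLS] wash ##ing machines [SEP]), the result is
+ # [-100, 1, -100, 0, -100]: special tokens get -100, and the continuation
+ # subtoken '##ing' gets -100 because its word label is a 'B-...' tag.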
392
+
393
+ def retrieve_bio_labels( dataset ):
394
+ '''
395
+ Needed for compute_metrics, I think? It seems to be using a classic metric for the BIO
396
+ scheme, thus we create a mapping to BIO labels, i.e.:
397
+ '_' --> 'O'
398
+ 'Seg=B-Conn' --> 'B'
399
+ 'Seg=I-Conn' --> 'I'
400
+ Should also work for segmentation TODO: check
401
+ datasets: dict: DatasetSeq instances for train/dev/test
402
+ Return: list: original label names
403
+ list: label names mapped to BIO
404
+ '''
405
+ # need a Dataset instance to retrieve the original label sets
406
+ task = dataset.task
407
+ LABEL_NAMES_BIO = []
408
+ LABEL_NAMES = dataset.LABEL_NAMES
409
+ label2idx, idx2newl = {}, {}
410
+ if task in ["conn", "seg"]:
411
+ for i,l in enumerate( LABEL_NAMES ):
412
+ label2idx[l] = i
413
+ for l in label2idx:
414
+ nl = ''
415
+ if 'B' in l:
416
+ nl = 'B'
417
+ elif 'I' in l:
418
+ nl = 'I'
419
+ else:
420
+ nl = 'O'
421
+ idx2newl[label2idx[l]] = nl
422
+ for i in sorted(idx2newl):
423
+ LABEL_NAMES_BIO.append(idx2newl[i])
424
+ #label_names = ['O', 'B', 'I']
425
+ return LABEL_NAMES_BIO
426
+
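+ # Example (sketch): if dataset.LABEL_NAMES == ['_', 'Seg=B-seg'], then
+ # retrieve_bio_labels(dataset) returns ['O', 'B'], which is the label set
+ # seqeval expects in compute_metrics.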
427
+ # def _compute_distrib( dataset, id2label ):
428
+ # distrib = {}
429
+ # multi = []
430
+ # for inst in dataset:
431
+ # label = id2label[inst['label']]
432
+ # if label in distrib:
433
+ # distrib[label] += 1
434
+ # else:
435
+ # distrib[label] = 1
436
+ # len_labels = len( inst["all_labels"])
437
+ # if len_labels > 1:
438
+ # #count_multi += 1
439
+ # multi.append( len_labels )
440
+ # return distrib, multi
441
+
442
+ # Defines the language code for the sentence splitter, should be done in disrpt_io?
443
+ def set_language( lang ):
444
+ #lang = "default-multilingual" #default value
445
+ # patch
446
+ if lang=="sp": lang="es"
447
+ if lang not in LANGUAGES:
448
+ lang = "default-multilingual"
449
+ return lang
450
+
451
+
452
+ # ------------------------------------------------------------------
453
+ if __name__=="__main__":
454
+ import argparse, os
455
+
456
+ parser = argparse.ArgumentParser(
457
+ description='DISCUT: reading data from disrpt_io and converting to HuggingFace'
458
+ )
459
+ # TRAIN AND DEV are (list of) FILES or DIRECTORIES
460
+ parser.add_argument("-t", "--train",
461
+ help="Training file. Default: data_test/eng.sample.rstdt/eng.sample.rstdt_train.conllu",
462
+ default="data_test/eng.sample.rstdt/eng.sample.rstdt_train.conllu")
463
+
464
+ parser.add_argument("-d", "--dev",
465
+ help="Dev file. Default: data/eng.sample.rstdt/eng.sample.rstdt_dev.conllu",
466
+ default="data_test/eng.sample.rstdt/eng.sample.rstdt_dev.conllu")
467
+
468
+ # OUTPUT DIRECTORY
469
+ parser.add_argument("-o", "--output",
470
+ help="Directory where models and pred will be saved. Default: /home/cbraud/experiments/expe_discut_2025/",
471
+ default="")
472
+
473
+ # CONFIG FILE
474
+ parser.add_argument("-c", "--config",
475
+ help="Config file. Default: ./config_seg.json",
476
+ default="./config_seg.json")
477
+
478
+ # TRACE / VERBOSITY
479
+ parser.add_argument( '-v', '--trace',
480
+ action='store_true',
481
+ default=False,
482
+ help="Whether to print full messages. If used, it will override the value in config file.")
483
+
484
+ args = parser.parse_args()
485
+
486
+ train_path = args.train
487
+ dev_path = args.dev
488
+ print(dev_path)
489
+ if not os.path.isfile(dev_path):
490
+ print( "ERROR with dev file:", dev_path)
491
+ output_path = args.output
492
+ config_file = args.config
493
+ #eval = args.eval
494
+ trace = args.trace
495
+
496
+ print( '\n-[JEDIS]--PROGRAM (reader) ARGUMENTS')
497
+ print( '| Train_path', train_path )
498
+ print( '| Dev_path', dev_path )
499
+ print( "| Output_path", output_path )
500
+ print( '| Config', config_file )
501
+
502
+ print( '\n-[JEDIS]--CONFIG INFO')
503
+ config = utils.read_config( config_file )
504
+ utils.print_config(config)
505
+ # WE override the config file if the user says no trace in arguments
506
+ # easier than modifying the config files each time
507
+ if not trace:
508
+ config['trace'] = False
509
+
510
+ print( "\n-[JEDIS]--READING DATASETS" )
511
+ # dictionary containing train (if mode=='train') and/or dev (test) Dataset instances
512
+ datasets, tokenizer = read_dataset( train_path, output_path, config, add_lang_token=True )
utils.py ADDED
@@ -0,0 +1,216 @@
1
+ #!/usr/bin/python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os, sys
5
+ import json
6
+ import numpy as np
7
+ from pathlib import Path
8
+ import itertools
9
+
10
+ import evaluate
11
+ import disrpt_eval_2025
12
+ #from .disrpt_eval_2025 import *
13
+
14
+ # TODO : should be conditioned on the task or the metric indicated in the config file ??
15
+ def prepare_compute_metrics(LABEL_NAMES):
16
+ '''
17
+ Return the method to be used in the trainer loop.
18
+ For seg or conn, based on seqeval, and here ignore tokens with label
19
+ -100 (okay ?)
20
+
21
+ Parameters:
22
+ ------------
23
+ LABEL_NAMES: list
24
+ Needed only for BIO labels, converted to the right labels for seqeval
28
+
29
+ Returns:
30
+ ---------
31
+ compute_metrics: function
32
+ '''
33
+ def compute_metrics(eval_preds):
34
+ nonlocal LABEL_NAMES
35
+ # nonlocal task
36
+ # Retrieve gold and predictions
37
+ logits, labels = eval_preds
38
+
39
+ predictions = np.argmax(logits, axis=-1)
40
+ metric = evaluate.load("seqeval")
41
+ # Remove ignored index (special tokens) and convert to labels
42
+ true_labels = [[LABEL_NAMES[l] for l in label if l != -100] for label in labels]
43
+ true_predictions = [
44
+ [LABEL_NAMES[p] for (p, l) in zip(prediction, label) if l != -100]
45
+ for prediction, label in zip(predictions, labels)
46
+ ]
47
+ all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
48
+ print_metrics( all_metrics )
49
+ return {
50
+ "precision": all_metrics["overall_precision"],
51
+ "recall": all_metrics["overall_recall"],
52
+ "f1": all_metrics["overall_f1"],
53
+ "accuracy": all_metrics["overall_accuracy"],
54
+ }
55
+ return compute_metrics
56
+
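+ # Usage sketch (assumes a DatasetSeq whose BIO label names are already set):
+ # compute_metrics = prepare_compute_metrics(dataset.LABEL_NAMES_BIO)
+ # then pass it to transformers.Trainer(compute_metrics=compute_metrics, ...)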
57
+
58
+ def print_metrics( all_metrics ):
59
+ #print( all_metrics )
60
+ for p,v in all_metrics.items():
61
+ if '_' in p:
62
+ print( p, v )
63
+ else:
64
+ print( p+' = '+str(v))
65
+
+ def compute_metrics_dirspt( dataset_eval, pred_file, task='seg' ):
+     '''Score a prediction file against the gold annotations with the official DISRPT evaluation script.'''
+     print( "\nPerformance computed using disrpt eval script on", dataset_eval.annotations_file,
+            pred_file )
+     if task == 'seg':
+         #clean_pred_file(pred_file, os.path.basename(pred_file)+"cleaned.preds")
+         my_eval = disrpt_eval_2025.SegmentationEvaluation("temp_test_disrpt_eval_seg",
+                                                           dataset_eval.annotations_file,
+                                                           pred_file )
+     elif task == 'conn':
+         my_eval = disrpt_eval_2025.ConnectivesEvaluation("temp_test_disrpt_eval_conn",
+                                                          dataset_eval.annotations_file,
+                                                          pred_file )
+     else:
+         raise NotImplementedError( "Unknown task for DISRPT evaluation: " + task )
+     my_eval.compute_scores()
+     my_eval.print_results()
+
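+ # Illustrative call only (the dataset and file names below are hypothetical):
+ #     compute_metrics_dirspt(dev_dataset, "out/eng.rst.gum_dev.preds", task='seg')
+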
+ def clean_pred_file(pred_path: str, out_path: str):
+     '''Copy a prediction file, dropping [LANG=...] / [FRAME=...] meta-token rows.'''
+     c = 0
+     with open(pred_path, "r", encoding="utf8") as fin, open(out_path, "w", encoding="utf8") as fout:
+         for line in fin:
+             # keep comments and sentence-separating blank lines as-is
+             if line.strip() == "" or line.startswith("#"):
+                 fout.write(line)
+                 continue
+             fields = line.strip().split("\t")
+             token = fields[1]
+             if token.startswith("[LANG=") or token.startswith("[FRAME="):
+                 c += 1
+                 continue  # skip meta-tokens
+             fout.write(line)
+     print(f"Removed {c} meta-tokens")
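+
+ # Descriptive note: rows whose FORM column (field 2) is a meta-token such as
+ # [LANG=eng] or [FRAME=...] are dropped; comments, blank lines and regular
+ # token rows are copied through unchanged.
+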
+ # -------------------------------------------------------------------------------------------------
+ # ------ UTILS FUNCTIONS
+ # -------------------------------------------------------------------------------------------------
+ def read_config( config_file ):
+     '''Read the JSON config file for training and normalize the frozen-layer spec.'''
+     with open(config_file) as f:
+         config = json.load(f)
+     if 'frozen' in config['trainer_config']:
+         config['trainer_config']["frozen"] = update_frozen_set( config['trainer_config']["frozen"] )
+     return config
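+
+ # A minimal config sketch (keys inferred from how `config` is used in this
+ # module; values are illustrative placeholders, not the project's defaults):
+ #     {
+ #       "model_name": "xlm-roberta-base",
+ #       "task": "seg",
+ #       "trace": true,
+ #       "wandb": "my-project", "wandb_ent": "my-entity",
+ #       "trainer_config": {"batch_size": 16, "learning_rate": 2e-05, "frozen": ["0-3", "12"]}
+ #     }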
+
+ def update_frozen_set( freeze ):
+     # Make a set from the list of frozen layer specs:
+     #   []                  --> nothing frozen
+     #   ["3"]               --> only layer 3 frozen
+     #   ["0", "3"]          --> only layers 0 and 3
+     #   ["0-3", "12", "15"] --> layers 0 to 3 included, plus layers 12 and 15
+     frozen = set()
+     for spec in freeze:
+         if "-" in spec:  # a range, e.g. "1-9"
+             b, e = spec.split("-")
+             frozen |= set(range(int(b), int(e) + 1))
+         else:
+             frozen.add(int(spec))
+     return frozen
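+
+ # Worked example (illustrative): update_frozen_set(["0-3", "12"]) returns
+ # {0, 1, 2, 3, 12}, i.e. the set of encoder layers to freeze during training.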
+
+ def print_config(config):
+     '''Print one "| key: value" line per entry of the config dictionary.'''
+     print('\n'.join([ '| ' + k + ": " + str(v) for (k, v) in config.items() ]))
+
+ # -------------------------------------------------------------------------------------------------
+ def retrieve_files_dataset( input_path, list_dataset, mode='conllu', dset='train' ):
+     '''Return the *_<dset> files under input_path, restricted to the corpora in list_dataset if non-empty.'''
+     if mode == 'conllu':
+         pat = ".[cC][oO][nN][lL][lL][uU]"  # case-insensitive .conllu extension
+     elif mode == 'tok':
+         pat = ".[tT][oO][kK]"              # case-insensitive .tok extension
+     else:
+         sys.exit('Unknown mode for file extension: ' + mode)
+     if len(list_dataset) == 0:
+         return list(Path(input_path).rglob("*_" + dset + pat))
+     else:
+         # file names look like eng.pdtb.pdtb_train.conllu
+         matched = []
+         for subdir in os.listdir( input_path ):
+             if subdir in list_dataset:
+                 matched.extend( list(Path(os.path.join(input_path, subdir)).rglob("*_" + dset + pat)) )
+         return matched
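+
+ # Illustrative call, assuming a layout with one sub-directory per corpus:
+ #     retrieve_files_dataset("data", ["eng.pdtb.pdtb"], mode='conllu', dset='train')
+ #     --> [PosixPath('data/eng.pdtb.pdtb/eng.pdtb.pdtb_train.conllu')]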
+
+
+ # -------------------------------------------------------------------------------------------------
+ # https://wandb.ai/site
+ def init_wandb( config, model_checkpoint, annotations_file ):
+     '''
+     Initialize a new WANDB run to keep track of the experiments.
+
+     Parameters
+     ----------
+     config : dict
+         Used to retrieve the names of the entity and project (from the config file)
+     model_checkpoint : str
+         Name of the PLM used
+     annotations_file : str
+         Path to the training file
+
+     Returns
+     -------
+     None
+     '''
+     print("HERE WE INITIALIZE A WANDB PROJECT")
+
+     import wandb
+     proj_wandb = config["wandb"]
+     ent_wandb = config["wandb_ent"]
+     # Start a new wandb run to track this script;
+     # the project name must be set before initializing the trainer
+     wandb.init(
+         # set the wandb project where this run will be logged
+         project=proj_wandb,
+         entity=ent_wandb,
+         # track hyperparameters and run metadata
+         config={
+             "model_checkpoint": model_checkpoint,
+             "dataset": annotations_file,
+         }
+     )
+     wandb.define_metric("epoch")
+     wandb.define_metric("f1", step_metric="batch")
+     wandb.define_metric("f1", step_metric="epoch")
+
+ def set_name_output_dir( output_dir, config, corpus_name ):
+     '''
+     Set the path of the target directory used to store models. The name contains
+     info about the corpus, the PLM, the task and the hyperparameter values.
+
+     Parameters
+     ----------
+     output_dir : str
+         Path to the output directory provided by the user
+     config : dict
+         Configuration information
+     corpus_name : str
+         Name of the corpus
+
+     Returns
+     -------
+     str : Path to the output directory
+     '''
+     # Write the learning rate in decimal form, to avoid scientific notation
+     hyperparam = [
+         config['trainer_config']['batch_size'],
+         np.format_float_positional(config['trainer_config']['learning_rate'])
+     ]
+     output_dir = os.path.join( output_dir,
+                                '_'.join( [
+                                    corpus_name,
+                                    config["model_name"],
+                                    config["task"],
+                                    '_'.join([str(p) for p in hyperparam])
+                                ] ) )
+     return output_dir
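+
+ # Illustrative result, assuming model_name="xlm-roberta-base", task="seg",
+ # batch_size=16 and learning_rate=2e-05 in the config:
+ #     set_name_output_dir("models", config, "eng.rst.gum")
+ #     --> "models/eng.rst.gum_xlm-roberta-base_seg_16_0.00002"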