Fred Serfati committed on
Commit
8c933a6
1 Parent(s): dd5406a

Initial commit

Browse files
Files changed (5) hide show
  1. app_bert.py +57 -0
  2. bert.py +31 -0
  3. model_bert_2.bin +3 -0
  4. requirements.txt +144 -0
  5. text_preprocessing.py +104 -0
app_bert.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ from transformers import RobertaTokenizer
4
+ from bert import RobertaClass
5
+ from text_preprocessing import preprocess_text
6
+
7
@st.cache_resource
def _load_model():
    '''Build the RoBERTa classifier and load the fine-tuned weights once.

    Streamlit re-executes the whole script on every widget interaction, so
    the result is cached to avoid re-reading the checkpoint on each rerun.
    '''
    net = RobertaClass()
    # NOTE(review): torch.load unpickles the file; only use checkpoints from
    # a trusted source (or consider passing weights_only=True).
    state = torch.load('model_bert_2.bin',
                       map_location=torch.device('cpu'))
    net.load_state_dict(state)
    # Switch to inference mode: torch.no_grad() alone does not disable the
    # dropout layer, which would make predictions vary between runs.
    net.eval()
    return net


@st.cache_resource
def _load_tokenizer():
    '''Load the roberta-base tokenizer once and reuse it across reruns.'''
    return RobertaTokenizer.from_pretrained(
        'roberta-base', truncation=True, do_lower_case=True)


# Load the fine-tuned RoBERTa model
model = _load_model()

# Load the tokenizer
tokenizer = _load_tokenizer()

# Define the user interface
st.title('ChatGPT detector')
text_input = st.text_input('Enter text to classify:', '')
submit_button = st.button('Classify')
20
+
21
+ # Define prediction function
22
+
23
+
24
def predict(text):
    '''Classify *text* and return ``(label, confidence)``.

    The label is 1 when the sigmoid of the model output is at least 0.5 and
    0 otherwise. The confidence is the probability of the returned label,
    so it is never below 0.5.
    '''
    cleaned = preprocess_text(text)

    encoded = tokenizer(cleaned, return_tensors='pt',
                        padding=True, truncation=True)
    # Drop token_type_ids if the tokenizer produced them; the model is only
    # given input_ids and attention_mask.
    encoded.pop('token_type_ids', None)

    # Gradients are not needed for inference.
    with torch.no_grad():
        logit = model(input_ids=encoded['input_ids'],
                      attention_mask=encoded['attention_mask'])

    positive_prob = torch.sigmoid(logit).item()
    if positive_prob >= 0.5:
        return 1, positive_prob
    # Report the confidence of the class that was actually chosen.
    return 0, 1 - positive_prob
47
+
48
+
49
# Handle user interaction
if submit_button:
    if not text_input.strip():
        # Nothing to classify: do not report a label and confidence for
        # empty or whitespace-only input.
        st.warning('Please enter some text to classify.')
    else:
        predicted_label, predicted_prob = predict(text_input)
        # Binary classification: the list index is the label from predict()
        labels = ['written by a human', 'generated by ChatGPT']
        predicted_category = labels[predicted_label]
        predicted_prob_percentage = round(predicted_prob * 100, 2)
        st.write(
            f"This text was {predicted_category} ({predicted_prob_percentage} % confident)")
bert.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import RobertaModel
3
+
4
class RobertaClass(torch.nn.Module):
    '''Pre-trained ``roberta-base`` encoder with a small classification head.

    The head is Linear(768, 768) -> ReLU -> Dropout(0.3) -> Linear(768, 1),
    so the forward pass yields one raw (pre-sigmoid) value per input row.
    '''

    def __init__(self):
        super().__init__()
        # Attribute names double as state_dict key prefixes; keep them stable
        # so saved checkpoints continue to load.
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        '''Run the encoder and the head; return the classifier output.'''
        encoder_out = self.roberta(input_ids=input_ids,
                                   attention_mask=attention_mask)
        # First element of the encoder output, position 0 along the second
        # axis (presumably the <s> token's hidden state -- TODO confirm).
        first_token = encoder_out[0][:, 0]
        activated = torch.nn.functional.relu(self.pre_classifier(first_token))
        return self.classifier(self.dropout(activated))
model_bert_2.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d38b95d7aa7a1eb2d9887da990527ab7b329f15da45433dd995886f0a65bea23
3
+ size 501032551
requirements.txt ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_b1m9c8aqie/croot/aiohttp_1707342290832/work
2
+ aiosignal @ file:///tmp/build/80754af9/aiosignal_1637843061372/work
3
+ altair @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_a8x4081_4h/croot/altair_1687526044471/work
4
+ annotated-types @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_1fa2djihwb/croot/annotated-types_1709542925772/work
5
+ appnope @ file:///Users/cbousseau/work/recipes/ci_py311/appnope_1677917710869/work
6
+ asttokens @ file:///opt/conda/conda-bld/asttokens_1646925590279/work
7
+ attrs @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_224434dqzl/croot/attrs_1695717839274/work
8
+ beautifulsoup4 @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_fa78jvo_0n/croot/beautifulsoup4-split_1681493044306/work
9
+ blinker @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_d64vvpf_j3/croot/blinker_1696539070114/work
10
+ blis @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_e15lfnq3gf/croot/cython-blis_1684139875892/work
11
+ Bottleneck @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_2bxpizxa3c/croot/bottleneck_1707864819812/work
12
+ Brotli @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_27zk0eqdh0/croot/brotli-split_1714483157007/work
13
+ cachetools @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_6a4ekiifd5/croot/cachetools_1713977095290/work
14
+ catalogue @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_80m8_830f9/croot/catalogue_1703688152663/work
15
+ catboost @ https://pypi.org/packages/cp311/c/catboost/catboost-1.2.5-cp311-cp311-macosx_11_0_universal2.whl#sha256=68dc17e6850f1ad8256fbbfbfac2c26b7be291c78b0961b135adf3aa6ed503bf
16
+ certifi @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_3bzbkiv4h_/croot/certifi_1707229182618/work/certifi
17
+ charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work
18
+ click @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_61srkg7e2e/croot/click_1698129815327/work
19
+ cloudpathlib @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_912gf4np5z/croot/cloudpathlib_1704812278170/work
20
+ colorama @ file:///Users/cbousseau/work/recipes/ci_py311/colorama_1677925183444/work
21
+ comm @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_3doui0bmzb/croot/comm_1709322861485/work
22
+ confection @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_35m7l4lyrd/croot/confection_1703694706795/work
23
+ contourpy @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_041uwyxdzo/croot/contourpy_1700583585236/work
24
+ cycler @ file:///tmp/build/80754af9/cycler_1637851556182/work
25
+ cymem @ file:///Users/cbousseau/work/recipes/ci_py311/cymem_1677931714912/work
26
+ datasets @ file:///home/conda/feedstock_root/build_artifacts/datasets_1713537449725/work
27
+ debugpy @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_563_nwtkoc/croot/debugpy_1690905063850/work
28
+ decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work
29
+ dill @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_03bak43n9t/croot/dill_1692271244297/work
30
+ en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl#sha256=ab70aeb6172cde82508f7739f35ebc9918a3d07debeed637403c8f794ba3d3dc
31
+ en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl#sha256=6a0f857a2b4d219c6fa17d455f82430b365bf53171a2d919b9376e5dc9be032e
32
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
33
+ executing @ file:///opt/conda/conda-bld/executing_1646925071911/work
34
+ filelock @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_d3quwmvouf/croot/filelock_1700591194006/work
35
+ fonttools @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_60c8ux4mkl/croot/fonttools_1713551354374/work
36
+ frozenlist @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_68w5bkvw82/croot/frozenlist_1698702578544/work
37
+ fsspec @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_19mkn689lo/croot/fsspec_1714461553219/work
38
+ gitdb @ file:///tmp/build/80754af9/gitdb_1617117951232/work
39
+ GitPython @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_4a22a3qbjg/croot/gitpython_1696936997893/work
40
+ gmpy2 @ file:///Users/cbousseau/work/recipes/ci_py311/gmpy2_1677937751357/work
41
+ graphviz @ file:///Users/cbousseau/work/recipes/ci_py311/python-graphviz_1678001933790/work
42
+ huggingface_hub @ file:///home/conda/feedstock_root/build_artifacts/huggingface_hub_1714763326260/work
43
+ idna @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a12xpo84t2/croot/idna_1714398852854/work
44
+ importlib-metadata @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_5498c88e7n/croot/importlib_metadata-suite_1704813534254/work
45
+ ipykernel @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_f428_5tjvx/croot/ipykernel_1705933835534/work
46
+ ipython @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a1tmxj9b4u/croot/ipython_1704833016119/work
47
+ ipywidgets @ file:///home/conda/feedstock_root/build_artifacts/ipywidgets_1707427226251/work
48
+ jedi @ file:///Users/cbousseau/work/recipes/ci_py311_2/jedi_1678994967789/work
49
+ Jinja2 @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_7dognxkzoy/croot/jinja2_1706733627811/work
50
+ joblib @ file:///home/conda/feedstock_root/build_artifacts/joblib_1714665484399/work
51
+ jsonschema @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_27o3go8sqa/croot/jsonschema_1699041627313/work
52
+ jsonschema-specifications @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_d38pclgu95/croot/jsonschema-specifications_1699032390832/work
53
+ jupyter_client @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_58w2siozyz/croot/jupyter_client_1699455907045/work
54
+ jupyter_core @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_782yoyc_98/croot/jupyter_core_1698937318631/work
55
+ jupyterlab_widgets @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_widgets_1707421892171/work
56
+ kiwisolver @ file:///Users/cbousseau/work/recipes/ci_py311/kiwisolver_1677925326358/work
57
+ langcodes @ file:///opt/conda/conda-bld/langcodes_1643477751144/work
58
+ lxml @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_b1f_3r_n5v/croot/lxml_1695058169427/work
59
+ markdown-it-py @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_43l_4ajkho/croot/markdown-it-py_1684279912406/work
60
+ MarkupSafe @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_a84ni4pci8/croot/markupsafe_1704206002077/work
61
+ matplotlib @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a8crvoz7ca/croot/matplotlib-suite_1713336381679/work
62
+ matplotlib-inline @ file:///Users/cbousseau/work/recipes/ci_py311/matplotlib-inline_1677918241899/work
63
+ mdurl @ file:///Users/cbousseau/work/recipes/ci_py311/mdurl_1677942260967/work
64
+ mpmath @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_17iu6a8a3m/croot/mpmath_1690848269369/work
65
+ multidict @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_10voz9m15i/croot/multidict_1701096890858/work
66
+ multiprocess @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_5fxbcbjtcd/croot/multiprocess_1692294387834/work
67
+ murmurhash @ file:///Users/cbousseau/work/recipes/ci_py311/murmurhash_1677932959271/work
68
+ nest-asyncio @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_310vb5e2a0/croot/nest-asyncio_1708532678212/work
69
+ networkx @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_b9af3smw_7/croot/networkx_1690562010704/work
70
+ numexpr @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_45yefq0kt6/croot/numexpr_1696515289183/work
71
+ numpy @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a51i_mbs7m/croot/numpy_and_numpy_base_1708638620867/work/dist/numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl#sha256=3d90dd3382cff7becb2384f73058a8e72b81c697e8bb77f1c69a82caca5b0c57
72
+ packaging @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a6lqg7at4g/croot/packaging_1710807410750/work
73
+ pandas @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_988rx6x546/croot/pandas_1709590494976/work/dist/pandas-2.2.1-cp311-cp311-macosx_11_0_arm64.whl#sha256=76c40380edef2477387870b6c2ccb59c14e52583ec49c5e254f3e74d918eb901
74
+ parso @ file:///opt/conda/conda-bld/parso_1641458642106/work
75
+ pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work
76
+ pillow @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_bdozlpra65/croot/pillow_1714398852206/work
77
+ platformdirs @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a8u4fy8k9o/croot/platformdirs_1692205661656/work
78
+ plotly @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_1fqquy5q2e/croot/plotly_1708976902906/work
79
+ preshed @ file:///Users/cbousseau/work/recipes/ci_py311/preshed_1677953474614/work
80
+ prompt-toolkit @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_c63v4kqjzr/croot/prompt-toolkit_1704404354115/work
81
+ protobuf==3.20.3
82
+ psutil @ file:///Users/cbousseau/work/recipes/ci_py311_2/psutil_1678995687212/work
83
+ ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
84
+ pure-eval @ file:///opt/conda/conda-bld/pure_eval_1646925070566/work
85
+ pyarrow @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_7dissav55a/croot/pyarrow_1707330837493/work/python
86
+ pyarrow-hotfix @ file:///home/conda/feedstock_root/build_artifacts/pyarrow-hotfix_1700596371886/work
87
+ pydantic @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_0ai8cvgm2c/croot/pydantic_1709577986211/work
88
+ pydantic_core @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_06smitnu98/croot/pydantic-core_1709573985903/work
89
+ pydeck @ file:///home/conda/feedstock_root/build_artifacts/pydeck_1667589451974/work
90
+ Pygments @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_29bs9f_dh9/croot/pygments_1684279974747/work
91
+ pyparsing @ file:///Users/cbousseau/work/recipes/ci_py311/pyparsing_1677910832141/work
92
+ PySocks @ file:///Users/cbousseau/work/recipes/ci_py311/pysocks_1677906386870/work
93
+ python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
94
+ pytz @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a4b76c83ik/croot/pytz_1713974318928/work
95
+ PyYAML @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_a8_sdgulmz/croot/pyyaml_1698096054705/work
96
+ pyzmq @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_43pxpbos3z/croot/pyzmq_1705605108344/work
97
+ referencing @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_5cz64gsx70/croot/referencing_1699012046031/work
98
+ regex @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_c10w1rynu_/croot/regex_1696515309790/work
99
+ requests @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_b3tnputioh/croot/requests_1707355573919/work
100
+ rich @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_f2payco56o/croot/rich_1684282180753/work
101
+ rpds-py @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_f8jkozoefm/croot/rpds-py_1698945944860/work
102
+ safetensors @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_09qdt_s9t7/croot/safetensors_1708633848061/work
103
+ scikit-learn @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_6f_fxtrklc/croot/scikit-learn_1714164747354/work
104
+ scipy @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_6f17pyifw3/croot/scipy_1714069789658/work/dist/scipy-1.13.0-cp311-cp311-macosx_11_0_arm64.whl#sha256=0f444a1500e173f20c1ba9d8651956d5b409ae14c1a40ca96c3e7d952f2fa6d0
105
+ seaborn @ file:///Users/cbousseau/work/recipes/ci_py311/seaborn_1677961968762/work
106
+ shellingham @ file:///Users/cbousseau/work/recipes/ci_py311/shellingham_1677910973388/work
107
+ six @ file:///tmp/build/80754af9/six_1644875935023/work
108
+ smart-open @ file:///Users/cbousseau/work/recipes/ci_py311/smart_open_1677955621457/work
109
+ smmap @ file:///tmp/build/80754af9/smmap_1611694433573/work
110
+ soupsieve @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_9798xzs_03/croot/soupsieve_1696347567192/work
111
+ spacy @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_18ske9dh81/croot/spacy_1704840104347/work
112
+ spacy-legacy @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_f7_dx1c18z/croot/spacy-legacy_1684141123193/work
113
+ spacy-loggers @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_68k4nvpzja/croot/spacy-loggers_1684138716415/work
114
+ srsly @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_38h6t42apc/croot/srsly_1703692316557/work
115
+ stack-data @ file:///opt/conda/conda-bld/stack_data_1646927590127/work
116
+ streamlit @ file:///home/conda/feedstock_root/build_artifacts/streamlit_1714694115633/work
117
+ sympy @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_7cbpg8656h/croot/sympy_1701397648473/work
118
+ tenacity @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_0ew5sfng29/croot/tenacity_1682972282256/work
119
+ thinc @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_49df34smdw/croot/thinc_1704703973981/work
120
+ threadpoolctl @ file:///Users/ktietz/demo/mc3/conda-bld/threadpoolctl_1629802263681/work
121
+ tokenizers @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_77bzam0w9g/croot/tokenizers_1708633828244/work
122
+ toml @ file:///tmp/build/80754af9/toml_1616166611790/work
123
+ toolz @ file:///Users/cbousseau/work/recipes/ci_py311/toolz_1677925870232/work
124
+ torch==2.3.0
125
+ tornado @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_3a5nrn2jeh/croot/tornado_1696936974091/work
126
+ tqdm @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_72w005t9uu/croot/tqdm_1714567716531/work
127
+ traitlets @ file:///Users/cbousseau/work/recipes/ci_py311/traitlets_1677911650502/work
128
+ transformers @ file:///home/conda/feedstock_root/build_artifacts/transformers_1709308155748/work
129
+ typer @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_e62p5mo8z5/croot/typer_1684251930377/work
130
+ typing_extensions @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_6ejdc7ufhc/croot/typing_extensions_1705599306111/work
131
+ tzdata @ file:///croot/python-tzdata_1690578112552/work
132
+ tzlocal @ file:///Users/cbousseau/work/recipes/ci_py311/tzlocal_1677955771765/work
133
+ unicodedata2 @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a3epjto7gs/croot/unicodedata2_1713212955584/work
134
+ urllib3 @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_aff2m3lasf/croot/urllib3_1707770561896/work
135
+ validators @ file:///tmp/build/80754af9/validators_1612286467315/work
136
+ wasabi @ file:///Users/cbousseau/work/recipes/ci_py311/wasabi_1677955837938/work
137
+ watchdog @ file:///Users/cbousseau/work/recipes/ci_py311/watchdog_1677963700938/work
138
+ wcwidth @ file:///Users/ktietz/demo/mc3/conda-bld/wcwidth_1629357192024/work
139
+ weasel @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_e2vc1q9m94/croot/weasel_1704815771776/work
140
+ widgetsnbextension @ file:///home/conda/feedstock_root/build_artifacts/widgetsnbextension_1707420319466/work
141
+ xgboost @ file:///home/conda/feedstock_root/build_artifacts/xgboost-split_1713397725960/work/python-package
142
+ xxhash @ file:///Users/cbousseau/work/recipes/ci_py311/python-xxhash_1677954188023/work
143
+ yarl @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a8s46tbbn5/croot/yarl_1701105147904/work
144
+ zipp @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_31jm3q76eq/croot/zipp_1704206913245/work
text_preprocessing.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+ from bs4 import BeautifulSoup
4
+
5
+
6
# Contraction -> expansion table passed to clean_contractions() by
# preprocess_text(). Entries are applied in insertion order using plain
# substring replacement, so the order of the keys is significant.
# Inside preprocess_text() the text has already been lower-cased by
# clean_text(), so the capitalised "I'..." keys cannot match there.
contraction_mapping = {"ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                       "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                       "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
                       "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have", "I'm": "I am",
                       "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am",
                       "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                       "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not",
                       "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
                       "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                       "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
                       "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
                       "so've": "so have", "so's": "so as", "this's": "this is", "that'd": "that would", "that'd've": "that would have", "that's": "that is",
                       "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is", "they'd": "they would",
                       "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have",
                       "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
                       "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
                       "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
                       "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is",
                       "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                       "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                       "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have",
                       "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'u.s': 'america', 'e.g': 'for example'}
28
+
29
# Characters that clean_special_chars() surrounds with a space on each side.
# A few entries (',', ':', ')', '(') occur twice as shown here, so those
# characters are padded twice.
# NOTE(review): the repeats may have been distinct full-width characters in
# the upstream source -- confirm against the original file.
punct = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£',
         '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
         '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
         '▒', ':', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞',
         '∙', ')', '↓', '、', '│', '(', '»', ',', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]
34
+
35
# Special character -> replacement text, applied by clean_special_chars()
# before the characters in `punct` are padded with spaces.
# The '“' key is listed twice with the same value; the repeat has no effect.
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-",
                 "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-',
                 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', '!': ' '}
38
+
39
+
40
def clean_text(text):
    '''Lower-case *text* and strip markup, links and non-letter characters.

    Steps, in order: drop ``:...:`` spans, lower-case, drop ``[...]`` spans,
    extract the text from any HTML, drop URLs and leftover ``<...>`` tags,
    delete newlines, drop words containing a digit, and finally replace each
    run of characters other than ASCII letters and ``? . ! , ¿ '`` with a
    single space.

    The regex literals are raw strings so that ``\\[``, ``\\S``, ``\\w`` and
    ``\\d`` are not treated as (invalid) string escapes; the compiled
    patterns are unchanged.
    '''
    # Anything between two colons on the same line is removed
    # (presumably emoji shortcodes such as ":smile:" -- TODO confirm).
    text = re.sub(r'\:(.*?)\:', '', text)
    text = str(text).lower()  # Making Text Lowercase
    text = re.sub(r'\[.*?\]', '', text)
    # The next line extracts the visible text from any HTML markup
    text = BeautifulSoup(text, 'lxml').get_text()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are deleted without inserting a space.
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",", "'")
    text = re.sub(r"[^a-zA-Z?.!,¿']+", " ", text)
    return text
55
+
56
+
57
def clean_contractions(text, mapping):
    '''Expand contractions found in *mapping*, then strip ASCII punctuation.

    Typographic apostrophes and accents are normalised to ``'`` first so
    that the keys of *mapping* can match. Returns the cleaned string.
    '''
    # NOTE(review): indentation was not recoverable from the diff view; the
    # punctuation stripping is assumed to run once, after the expansion loop.
    for quote_variant in ("’", "‘", "´", "`"):
        text = text.replace(quote_variant, "'")

    # str.replace leaves the text untouched when the contraction is absent,
    # so no membership test is needed beforehand.
    for contraction, expansion in mapping.items():
        text = text.replace(contraction, expansion)

    # Remove every character listed in string.punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Put a space on each side of any remaining character from this set,
    # eg: "¿que" => " ¿ que"
    padded = re.sub(r"([?.!,¿])", r" \1 ", text)
    # Collapse runs of spaces and double quotes into a single space
    return re.sub(r'[" "]+', " ", padded)
72
+
73
+
74
def clean_special_chars(text, punct, mapping):
    '''Normalise special characters in *text* and pad punctuation with spaces.

    *mapping* replacements are applied first, then every entry of *punct* is
    surrounded by a space on each side, and finally a fixed set of leftover
    sequences is replaced. Returns the resulting string.
    '''
    for special, replacement in mapping.items():
        text = text.replace(special, replacement)

    for mark in punct:
        text = text.replace(mark, ' ' + mark + ' ')

    # Fixed clean-ups applied last, in this order
    leftovers = (
        ('\u200b', ' '),
        ('…', ' ... '),
        ('\ufeff', ''),
        ('करना', ''),
        ('है', ''),
    )
    for target, substitute in leftovers:
        text = text.replace(target, substitute)

    return text
88
+
89
+
90
def remove_space(text):
    '''Collapse every run of whitespace to one space and trim both ends.'''
    # split() with no argument already discards leading/trailing whitespace
    # and treats any whitespace run as a single separator.
    return " ".join(text.split())
96
+
97
+
98
def preprocess_text(text):
    '''Run the full cleaning pipeline on *text* and return the result.

    Order: clean_text, clean_contractions (with contraction_mapping),
    clean_special_chars (with punct and punct_mapping), remove_space.
    '''
    cleaned = clean_text(text)
    expanded = clean_contractions(cleaned, contraction_mapping)
    normalised = clean_special_chars(expanded, punct, punct_mapping)
    return remove_space(normalised)
104
+ return text