Spaces:
Runtime error
Runtime error
Fred Serfati
committed on
Commit
•
8c933a6
1
Parent(s):
dd5406a
Initial commit
Browse files- app_bert.py +57 -0
- bert.py +31 -0
- model_bert_2.bin +3 -0
- requirements.txt +144 -0
- text_preprocessing.py +104 -0
app_bert.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import torch
|
3 |
+
from transformers import RobertaTokenizer
|
4 |
+
from bert import RobertaClass
|
5 |
+
from text_preprocessing import preprocess_text
|
6 |
+
|
7 |
+
# Load the fine-tuned RoBERTa classifier; the checkpoint tensors are mapped
# onto the CPU when they are read from disk.
model = RobertaClass()
model.load_state_dict(torch.load('model_bert_2.bin',
                                 map_location=torch.device('cpu')))
# Switch to inference mode. A freshly constructed nn.Module starts in training
# mode, which would leave the classifier's dropout layer active: predictions
# for the same text would then vary from one run to the next, and
# torch.no_grad() alone does not turn dropout off.
model.eval()

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained(
    'roberta-base', truncation=True, do_lower_case=True)

# Define the user interface
st.title('ChatGPT detector')
text_input = st.text_input('Enter text to classify:', '')
submit_button = st.button('Classify')
|
20 |
+
|
21 |
+
# Define prediction function
|
22 |
+
|
23 |
+
|
24 |
+
def predict(text):
    '''Classify ``text`` and return ``(label, confidence)``.

    The text is run through ``preprocess_text``, tokenized, and fed to the
    module-level ``model``. The single output is passed through a sigmoid to
    get the probability of label 1.

    Returns:
        A tuple ``(label, confidence)`` where ``label`` is 1 when that
        probability is at least 0.5 and 0 otherwise, and ``confidence`` is
        the probability of the returned label (``p`` for label 1, ``1 - p``
        for label 0).
    '''
    cleaned = preprocess_text(text)

    encoded = tokenizer(cleaned, return_tensors='pt',
                        padding=True, truncation=True)
    # The model's forward() only accepts input_ids and attention_mask.
    encoded.pop('token_type_ids', None)

    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(input_ids=encoded['input_ids'],
                       attention_mask=encoded['attention_mask'])

    positive_prob = torch.sigmoid(logits).item()
    if positive_prob >= 0.5:
        return 1, positive_prob
    # Report confidence in the predicted (negative) label, not in label 1.
    return 0, 1 - positive_prob
|
47 |
+
|
48 |
+
|
49 |
+
# Handle user interaction
if submit_button:
    if not text_input.strip():
        # Nothing to classify: running the model on an empty string would
        # still print a label and a confidence, which is meaningless.
        st.warning('Please enter some text to classify.')
    else:
        predicted_label, predicted_prob = predict(text_input)
        # Binary classification: labels[0] is shown for label 0 and
        # labels[1] for label 1.
        labels = ['written by a human', 'generated by ChatGPT']
        predicted_category = labels[predicted_label]
        predicted_prob_percentage = round(predicted_prob * 100, 2)
        st.write(
            f"This text was {predicted_category} ({predicted_prob_percentage} % confident)")
|
bert.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from transformers import RobertaModel
|
3 |
+
|
4 |
+
class RobertaClass(torch.nn.Module):
    '''RoBERTa encoder followed by a small head that emits one value per input.

    The head is Linear(768, 768) -> ReLU -> Dropout(0.3) -> Linear(768, 1).
    Attribute names are part of the saved state dict and must not change.
    '''

    def __init__(self):
        super().__init__()
        # Pre-trained encoder.
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        # Classification head layers.
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        '''Return the classifier output for a batch of tokenized inputs.

        Args:
            input_ids: token ids passed through to the encoder.
            attention_mask: attention mask passed through to the encoder.

        Returns:
            The output of the final ``Linear(768, 1)`` layer (no sigmoid is
            applied here).
        '''
        encoder_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)
        # Element 0 of the encoder output, sliced at position 0 of its second
        # axis -- presumably the hidden state of the first token of each
        # sequence; confirm against the transformers documentation.
        first_token = encoder_output[0][:, 0]
        hidden = torch.relu(self.pre_classifier(first_token))
        return self.classifier(self.dropout(hidden))
|
model_bert_2.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d38b95d7aa7a1eb2d9887da990527ab7b329f15da45433dd995886f0a65bea23
|
3 |
+
size 501032551
|
requirements.txt
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiohttp @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_b1m9c8aqie/croot/aiohttp_1707342290832/work
|
2 |
+
aiosignal @ file:///tmp/build/80754af9/aiosignal_1637843061372/work
|
3 |
+
altair @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_a8x4081_4h/croot/altair_1687526044471/work
|
4 |
+
annotated-types @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_1fa2djihwb/croot/annotated-types_1709542925772/work
|
5 |
+
appnope @ file:///Users/cbousseau/work/recipes/ci_py311/appnope_1677917710869/work
|
6 |
+
asttokens @ file:///opt/conda/conda-bld/asttokens_1646925590279/work
|
7 |
+
attrs @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_224434dqzl/croot/attrs_1695717839274/work
|
8 |
+
beautifulsoup4 @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_fa78jvo_0n/croot/beautifulsoup4-split_1681493044306/work
|
9 |
+
blinker @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_d64vvpf_j3/croot/blinker_1696539070114/work
|
10 |
+
blis @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_e15lfnq3gf/croot/cython-blis_1684139875892/work
|
11 |
+
Bottleneck @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_2bxpizxa3c/croot/bottleneck_1707864819812/work
|
12 |
+
Brotli @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_27zk0eqdh0/croot/brotli-split_1714483157007/work
|
13 |
+
cachetools @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_6a4ekiifd5/croot/cachetools_1713977095290/work
|
14 |
+
catalogue @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_80m8_830f9/croot/catalogue_1703688152663/work
|
15 |
+
catboost @ https://pypi.org/packages/cp311/c/catboost/catboost-1.2.5-cp311-cp311-macosx_11_0_universal2.whl#sha256=68dc17e6850f1ad8256fbbfbfac2c26b7be291c78b0961b135adf3aa6ed503bf
|
16 |
+
certifi @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_3bzbkiv4h_/croot/certifi_1707229182618/work/certifi
|
17 |
+
charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work
|
18 |
+
click @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_61srkg7e2e/croot/click_1698129815327/work
|
19 |
+
cloudpathlib @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_912gf4np5z/croot/cloudpathlib_1704812278170/work
|
20 |
+
colorama @ file:///Users/cbousseau/work/recipes/ci_py311/colorama_1677925183444/work
|
21 |
+
comm @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_3doui0bmzb/croot/comm_1709322861485/work
|
22 |
+
confection @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_35m7l4lyrd/croot/confection_1703694706795/work
|
23 |
+
contourpy @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_041uwyxdzo/croot/contourpy_1700583585236/work
|
24 |
+
cycler @ file:///tmp/build/80754af9/cycler_1637851556182/work
|
25 |
+
cymem @ file:///Users/cbousseau/work/recipes/ci_py311/cymem_1677931714912/work
|
26 |
+
datasets @ file:///home/conda/feedstock_root/build_artifacts/datasets_1713537449725/work
|
27 |
+
debugpy @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_563_nwtkoc/croot/debugpy_1690905063850/work
|
28 |
+
decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work
|
29 |
+
dill @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_03bak43n9t/croot/dill_1692271244297/work
|
30 |
+
en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl#sha256=ab70aeb6172cde82508f7739f35ebc9918a3d07debeed637403c8f794ba3d3dc
|
31 |
+
en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl#sha256=6a0f857a2b4d219c6fa17d455f82430b365bf53171a2d919b9376e5dc9be032e
|
32 |
+
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
|
33 |
+
executing @ file:///opt/conda/conda-bld/executing_1646925071911/work
|
34 |
+
filelock @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_d3quwmvouf/croot/filelock_1700591194006/work
|
35 |
+
fonttools @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_60c8ux4mkl/croot/fonttools_1713551354374/work
|
36 |
+
frozenlist @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_68w5bkvw82/croot/frozenlist_1698702578544/work
|
37 |
+
fsspec @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_19mkn689lo/croot/fsspec_1714461553219/work
|
38 |
+
gitdb @ file:///tmp/build/80754af9/gitdb_1617117951232/work
|
39 |
+
GitPython @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_4a22a3qbjg/croot/gitpython_1696936997893/work
|
40 |
+
gmpy2 @ file:///Users/cbousseau/work/recipes/ci_py311/gmpy2_1677937751357/work
|
41 |
+
graphviz @ file:///Users/cbousseau/work/recipes/ci_py311/python-graphviz_1678001933790/work
|
42 |
+
huggingface_hub @ file:///home/conda/feedstock_root/build_artifacts/huggingface_hub_1714763326260/work
|
43 |
+
idna @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a12xpo84t2/croot/idna_1714398852854/work
|
44 |
+
importlib-metadata @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_5498c88e7n/croot/importlib_metadata-suite_1704813534254/work
|
45 |
+
ipykernel @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_f428_5tjvx/croot/ipykernel_1705933835534/work
|
46 |
+
ipython @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a1tmxj9b4u/croot/ipython_1704833016119/work
|
47 |
+
ipywidgets @ file:///home/conda/feedstock_root/build_artifacts/ipywidgets_1707427226251/work
|
48 |
+
jedi @ file:///Users/cbousseau/work/recipes/ci_py311_2/jedi_1678994967789/work
|
49 |
+
Jinja2 @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_7dognxkzoy/croot/jinja2_1706733627811/work
|
50 |
+
joblib @ file:///home/conda/feedstock_root/build_artifacts/joblib_1714665484399/work
|
51 |
+
jsonschema @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_27o3go8sqa/croot/jsonschema_1699041627313/work
|
52 |
+
jsonschema-specifications @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_d38pclgu95/croot/jsonschema-specifications_1699032390832/work
|
53 |
+
jupyter_client @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_58w2siozyz/croot/jupyter_client_1699455907045/work
|
54 |
+
jupyter_core @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_782yoyc_98/croot/jupyter_core_1698937318631/work
|
55 |
+
jupyterlab_widgets @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_widgets_1707421892171/work
|
56 |
+
kiwisolver @ file:///Users/cbousseau/work/recipes/ci_py311/kiwisolver_1677925326358/work
|
57 |
+
langcodes @ file:///opt/conda/conda-bld/langcodes_1643477751144/work
|
58 |
+
lxml @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_b1f_3r_n5v/croot/lxml_1695058169427/work
|
59 |
+
markdown-it-py @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_43l_4ajkho/croot/markdown-it-py_1684279912406/work
|
60 |
+
MarkupSafe @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_a84ni4pci8/croot/markupsafe_1704206002077/work
|
61 |
+
matplotlib @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a8crvoz7ca/croot/matplotlib-suite_1713336381679/work
|
62 |
+
matplotlib-inline @ file:///Users/cbousseau/work/recipes/ci_py311/matplotlib-inline_1677918241899/work
|
63 |
+
mdurl @ file:///Users/cbousseau/work/recipes/ci_py311/mdurl_1677942260967/work
|
64 |
+
mpmath @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_17iu6a8a3m/croot/mpmath_1690848269369/work
|
65 |
+
multidict @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_10voz9m15i/croot/multidict_1701096890858/work
|
66 |
+
multiprocess @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_5fxbcbjtcd/croot/multiprocess_1692294387834/work
|
67 |
+
murmurhash @ file:///Users/cbousseau/work/recipes/ci_py311/murmurhash_1677932959271/work
|
68 |
+
nest-asyncio @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_310vb5e2a0/croot/nest-asyncio_1708532678212/work
|
69 |
+
networkx @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_b9af3smw_7/croot/networkx_1690562010704/work
|
70 |
+
numexpr @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_45yefq0kt6/croot/numexpr_1696515289183/work
|
71 |
+
numpy @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a51i_mbs7m/croot/numpy_and_numpy_base_1708638620867/work/dist/numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl#sha256=3d90dd3382cff7becb2384f73058a8e72b81c697e8bb77f1c69a82caca5b0c57
|
72 |
+
packaging @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a6lqg7at4g/croot/packaging_1710807410750/work
|
73 |
+
pandas @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_988rx6x546/croot/pandas_1709590494976/work/dist/pandas-2.2.1-cp311-cp311-macosx_11_0_arm64.whl#sha256=76c40380edef2477387870b6c2ccb59c14e52583ec49c5e254f3e74d918eb901
|
74 |
+
parso @ file:///opt/conda/conda-bld/parso_1641458642106/work
|
75 |
+
pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work
|
76 |
+
pillow @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_bdozlpra65/croot/pillow_1714398852206/work
|
77 |
+
platformdirs @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a8u4fy8k9o/croot/platformdirs_1692205661656/work
|
78 |
+
plotly @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_1fqquy5q2e/croot/plotly_1708976902906/work
|
79 |
+
preshed @ file:///Users/cbousseau/work/recipes/ci_py311/preshed_1677953474614/work
|
80 |
+
prompt-toolkit @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_c63v4kqjzr/croot/prompt-toolkit_1704404354115/work
|
81 |
+
protobuf==3.20.3
|
82 |
+
psutil @ file:///Users/cbousseau/work/recipes/ci_py311_2/psutil_1678995687212/work
|
83 |
+
ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
|
84 |
+
pure-eval @ file:///opt/conda/conda-bld/pure_eval_1646925070566/work
|
85 |
+
pyarrow @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_7dissav55a/croot/pyarrow_1707330837493/work/python
|
86 |
+
pyarrow-hotfix @ file:///home/conda/feedstock_root/build_artifacts/pyarrow-hotfix_1700596371886/work
|
87 |
+
pydantic @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_0ai8cvgm2c/croot/pydantic_1709577986211/work
|
88 |
+
pydantic_core @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_06smitnu98/croot/pydantic-core_1709573985903/work
|
89 |
+
pydeck @ file:///home/conda/feedstock_root/build_artifacts/pydeck_1667589451974/work
|
90 |
+
Pygments @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_29bs9f_dh9/croot/pygments_1684279974747/work
|
91 |
+
pyparsing @ file:///Users/cbousseau/work/recipes/ci_py311/pyparsing_1677910832141/work
|
92 |
+
PySocks @ file:///Users/cbousseau/work/recipes/ci_py311/pysocks_1677906386870/work
|
93 |
+
python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
|
94 |
+
pytz @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a4b76c83ik/croot/pytz_1713974318928/work
|
95 |
+
PyYAML @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_a8_sdgulmz/croot/pyyaml_1698096054705/work
|
96 |
+
pyzmq @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_43pxpbos3z/croot/pyzmq_1705605108344/work
|
97 |
+
referencing @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_5cz64gsx70/croot/referencing_1699012046031/work
|
98 |
+
regex @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_c10w1rynu_/croot/regex_1696515309790/work
|
99 |
+
requests @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_b3tnputioh/croot/requests_1707355573919/work
|
100 |
+
rich @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_f2payco56o/croot/rich_1684282180753/work
|
101 |
+
rpds-py @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_f8jkozoefm/croot/rpds-py_1698945944860/work
|
102 |
+
safetensors @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_09qdt_s9t7/croot/safetensors_1708633848061/work
|
103 |
+
scikit-learn @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_6f_fxtrklc/croot/scikit-learn_1714164747354/work
|
104 |
+
scipy @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_6f17pyifw3/croot/scipy_1714069789658/work/dist/scipy-1.13.0-cp311-cp311-macosx_11_0_arm64.whl#sha256=0f444a1500e173f20c1ba9d8651956d5b409ae14c1a40ca96c3e7d952f2fa6d0
|
105 |
+
seaborn @ file:///Users/cbousseau/work/recipes/ci_py311/seaborn_1677961968762/work
|
106 |
+
shellingham @ file:///Users/cbousseau/work/recipes/ci_py311/shellingham_1677910973388/work
|
107 |
+
six @ file:///tmp/build/80754af9/six_1644875935023/work
|
108 |
+
smart-open @ file:///Users/cbousseau/work/recipes/ci_py311/smart_open_1677955621457/work
|
109 |
+
smmap @ file:///tmp/build/80754af9/smmap_1611694433573/work
|
110 |
+
soupsieve @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_9798xzs_03/croot/soupsieve_1696347567192/work
|
111 |
+
spacy @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_18ske9dh81/croot/spacy_1704840104347/work
|
112 |
+
spacy-legacy @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_f7_dx1c18z/croot/spacy-legacy_1684141123193/work
|
113 |
+
spacy-loggers @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_68k4nvpzja/croot/spacy-loggers_1684138716415/work
|
114 |
+
srsly @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_38h6t42apc/croot/srsly_1703692316557/work
|
115 |
+
stack-data @ file:///opt/conda/conda-bld/stack_data_1646927590127/work
|
116 |
+
streamlit @ file:///home/conda/feedstock_root/build_artifacts/streamlit_1714694115633/work
|
117 |
+
sympy @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_7cbpg8656h/croot/sympy_1701397648473/work
|
118 |
+
tenacity @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_0ew5sfng29/croot/tenacity_1682972282256/work
|
119 |
+
thinc @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_49df34smdw/croot/thinc_1704703973981/work
|
120 |
+
threadpoolctl @ file:///Users/ktietz/demo/mc3/conda-bld/threadpoolctl_1629802263681/work
|
121 |
+
tokenizers @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_77bzam0w9g/croot/tokenizers_1708633828244/work
|
122 |
+
toml @ file:///tmp/build/80754af9/toml_1616166611790/work
|
123 |
+
toolz @ file:///Users/cbousseau/work/recipes/ci_py311/toolz_1677925870232/work
|
124 |
+
torch==2.3.0
|
125 |
+
tornado @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_3a5nrn2jeh/croot/tornado_1696936974091/work
|
126 |
+
tqdm @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_72w005t9uu/croot/tqdm_1714567716531/work
|
127 |
+
traitlets @ file:///Users/cbousseau/work/recipes/ci_py311/traitlets_1677911650502/work
|
128 |
+
transformers @ file:///home/conda/feedstock_root/build_artifacts/transformers_1709308155748/work
|
129 |
+
typer @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_e62p5mo8z5/croot/typer_1684251930377/work
|
130 |
+
typing_extensions @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_6ejdc7ufhc/croot/typing_extensions_1705599306111/work
|
131 |
+
tzdata @ file:///croot/python-tzdata_1690578112552/work
|
132 |
+
tzlocal @ file:///Users/cbousseau/work/recipes/ci_py311/tzlocal_1677955771765/work
|
133 |
+
unicodedata2 @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a3epjto7gs/croot/unicodedata2_1713212955584/work
|
134 |
+
urllib3 @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_aff2m3lasf/croot/urllib3_1707770561896/work
|
135 |
+
validators @ file:///tmp/build/80754af9/validators_1612286467315/work
|
136 |
+
wasabi @ file:///Users/cbousseau/work/recipes/ci_py311/wasabi_1677955837938/work
|
137 |
+
watchdog @ file:///Users/cbousseau/work/recipes/ci_py311/watchdog_1677963700938/work
|
138 |
+
wcwidth @ file:///Users/ktietz/demo/mc3/conda-bld/wcwidth_1629357192024/work
|
139 |
+
weasel @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_e2vc1q9m94/croot/weasel_1704815771776/work
|
140 |
+
widgetsnbextension @ file:///home/conda/feedstock_root/build_artifacts/widgetsnbextension_1707420319466/work
|
141 |
+
xgboost @ file:///home/conda/feedstock_root/build_artifacts/xgboost-split_1713397725960/work/python-package
|
142 |
+
xxhash @ file:///Users/cbousseau/work/recipes/ci_py311/python-xxhash_1677954188023/work
|
143 |
+
yarl @ file:///private/var/folders/k1/30mswbxs7r1g6zwn8y4fyt500000gp/T/abs_a8s46tbbn5/croot/yarl_1701105147904/work
|
144 |
+
zipp @ file:///private/var/folders/nz/j6p8yfhx1mv_0grj5xl4650h0000gp/T/abs_31jm3q76eq/croot/zipp_1704206913245/work
|
text_preprocessing.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import string
|
3 |
+
from bs4 import BeautifulSoup
|
4 |
+
|
5 |
+
|
6 |
+
# Contraction -> expanded form. preprocess_text() passes this table to
# clean_contractions(), which applies it by plain substring replacement in
# the order the entries are listed here.
# Note: preprocess_text() lowercases the text (in clean_text) before this
# table is applied, so the capitalised keys ("I'd", "I'm", ...) cannot match
# in that pipeline; only their lowercase twins do.
contraction_mapping = {"ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                       "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                       "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
                       "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have", "I'm": "I am",
                       "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am",
                       "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                       "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not",
                       "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
                       "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                       "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
                       "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
                       "so've": "so have", "so's": "so as", "this's": "this is", "that'd": "that would", "that'd've": "that would have", "that's": "that is",
                       "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is", "they'd": "they would",
                       "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have",
                       "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
                       "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
                       "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
                       "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is",
                       "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                       "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                       "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", "you'd've": "you would have",
                       "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'u.s': 'america', 'e.g': 'for example'}
|
28 |
+
|
29 |
+
# Characters that clean_special_chars() surrounds with a space on each side
# (preprocess_text() passes this list as its `punct` argument).
# Some entries appear more than once; each occurrence pads the character
# again, and remove_space() later collapses the resulting runs of spaces.
# NOTE(review): the repeated ':', ')', '(' and ',' entries may originally
# have been full-width variants of those characters -- confirm against the
# list this was copied from.
punct = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£',
         '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…',
         '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─',
         '▒', ':', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞',
         '∙', ')', '↓', '、', '│', '(', '»', ',', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]
|
34 |
+
|
35 |
+
# Special character -> replacement text. clean_special_chars() applies these
# substitutions first, before it pads the characters listed in `punct`.
# The key '“' is listed twice with the same value, so the duplicate has no
# effect.
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-",
                 "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-',
                 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', '!': ' '}
|
38 |
+
|
39 |
+
|
40 |
+
def clean_text(text):
    '''Lowercase the text and strip markup, links and digit-bearing words.

    Steps, in order: drop ``:...:`` spans, lowercase, drop ``[...]`` spans,
    extract the text from any HTML with BeautifulSoup, remove URLs, remove
    leftover ``<...>`` tags, delete newlines, remove words that contain a
    digit, and finally replace every run of characters other than ASCII
    letters and ``? . ! , ¿ '`` with a single space.

    Args:
        text: the raw input string.

    Returns:
        The cleaned, lowercased string.
    '''
    # All patterns are raw strings: escapes such as \[ \S \w \d are invalid
    # in ordinary string literals and trigger a warning on current Python.
    text = re.sub(r'\:(.*?)\:', '', text)
    text = str(text).lower()  # Making Text Lowercase
    text = re.sub(r'\[.*?\]', '', text)
    # The next 2 lines remove html text
    text = BeautifulSoup(text, 'lxml').get_text()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are deleted outright (not turned into spaces), so words on
    # either side of a line break are joined. Left unchanged on purpose:
    # presumably the model was trained on text preprocessed this way.
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",", "¿", "'")
    text = re.sub(r"[^a-zA-Z?.!,¿']+", " ", text)
    return text
|
55 |
+
|
56 |
+
|
57 |
+
def clean_contractions(text, mapping):
    '''Expand the contractions in ``mapping`` and strip ASCII punctuation.

    Args:
        text: the string to clean.
        mapping: contraction -> expansion pairs, applied as plain substring
            replacements in the mapping's iteration order.

    Returns:
        The text with apostrophe look-alikes normalised, contractions
        expanded, every ``string.punctuation`` character removed, ``¿``
        (and any remaining ``? . ! ,``) padded with spaces, and runs of
        spaces collapsed to one.
    '''
    # Normalise apostrophe look-alikes so the mapping keys can match.
    for quote_variant in ("’", "‘", "´", "`"):
        text = text.replace(quote_variant, "'")

    for contraction, expansion in mapping.items():
        text = text.replace(contraction, expansion)

    # Remove every ASCII punctuation character.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    text = re.sub(punctuation_class, '', text)

    # Pad sentence punctuation with spaces. After the removal above only
    # "¿" (not part of string.punctuation) can still be present.
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    return re.sub(r'[" "]+', " ", text)
|
72 |
+
|
73 |
+
|
74 |
+
def clean_special_chars(text, punct, mapping):
    '''Substitute special characters and pad punctuation with spaces.

    Args:
        text: the string to clean.
        punct: characters to surround with a space on each side.
        mapping: character -> replacement pairs, applied before the padding.

    Returns:
        The text after the ``mapping`` substitutions, the ``punct`` padding,
        and a final fixed set of replacements (zero-width space, ellipsis,
        byte-order mark and two Hindi words).
    '''
    for original, substitute in mapping.items():
        text = text.replace(original, substitute)

    for mark in punct:
        text = text.replace(mark, f' {mark} ')

    # Fixed clean-ups, applied last and in this order.
    leftovers = (
        ('\u200b', ' '),
        ('…', ' ... '),
        ('\ufeff', ''),
        ('करना', ''),
        ('है', ''),
    )
    for target, substitute in leftovers:
        text = text.replace(target, substitute)

    return text
|
88 |
+
|
89 |
+
|
90 |
+
def remove_space(text):
    '''Trim the text and collapse every run of whitespace to a single space.'''
    # split() with no arguments already ignores leading/trailing whitespace.
    words = text.split()
    return " ".join(words)
|
96 |
+
|
97 |
+
|
98 |
+
def preprocess_text(text):
    '''Run the full cleaning pipeline on ``text`` and return the result.

    Applies, in order: ``clean_text``, ``clean_contractions`` (with
    ``contraction_mapping``), ``clean_special_chars`` (with ``punct`` and
    ``punct_mapping``) and ``remove_space``.
    '''
    cleaned = clean_text(text)
    expanded = clean_contractions(cleaned, contraction_mapping)
    spaced = clean_special_chars(expanded, punct, punct_mapping)
    return remove_space(spaced)
|