lydianish committed on
Commit
620cefc
·
verified ·
1 Parent(s): 245b6ee

Upload 2 files

Browse files
Files changed (2) hide show
  1. Dockerfile +85 -0
  2. app.py +78 -0
Dockerfile ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use a base image with Conda installed.
# Only the final FROM of a Dockerfile produces the resulting image; earlier
# stages matter only if something is copied out of them, which this build
# never does, so a single base image is declared.
FROM continuumio/miniconda3:latest

WORKDIR /app
COPY *requirements.txt /app/

# Install the build tools in a single layer (dropping the apt package lists
# afterwards to keep the layer small), then clone the RoLASER repository into
# the container. wget is listed explicitly because the model download below
# depends on it.
RUN apt-get update && \
    apt-get install -y g++ unzip git wget && \
    rm -rf /var/lib/apt/lists/* && \
    git clone https://github.com/lydianish/RoLASER.git /app/RoLASER

# Create conda environment with the required Python and GCC versions
WORKDIR /app/RoLASER
COPY environment.yml /app/RoLASER/environment.yml
RUN conda env create -f environment.yml
RUN echo "conda activate rolaser_env" >> ~/.bashrc
# Put the environment's binaries first so pip3/python3 below resolve to it.
ENV PATH=/opt/conda/envs/rolaser_env/bin:$PATH
ENV ROLASER=/app/RoLASER

# Install PyTorch 1.10.1
RUN pip3 install --no-cache-dir torch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 -f https://download.pytorch.org/whl/cu102/torch_stable.html

# Install Fairseq and dependencies
RUN git clone https://github.com/lydianish/fairseq.git /app/fairseq
WORKDIR /app/fairseq
RUN git checkout rolaser
RUN pip3 install --no-cache-dir --editable .
RUN python3 setup.py build_ext --inplace
RUN pip3 install --no-cache-dir -r /app/FAIRSEQ_requirements.txt
ENV FAIRSEQ=/app/fairseq

# Install LASER and dependencies
RUN git clone https://github.com/lydianish/LASER.git /app/LASER
WORKDIR /app/LASER
RUN git checkout rolaser
ENV LASER=/app/LASER
RUN bash ./install_external_tools.sh
RUN pip3 install --no-cache-dir -r /app/LASER_requirements.txt

# Install other RoLASER dependencies
WORKDIR /app/RoLASER
RUN pip3 install --no-cache-dir -r /app/ROLASER_requirements.txt

# Download models.
# Download, extraction, sorting and clean-up happen in ONE layer: deleting the
# archive in a later RUN would not remove it from the earlier layer, so the
# image would still carry the full zip.
WORKDIR /app/RoLASER/models
RUN wget https://zenodo.org/api/records/10864557/files-archive && \
    unzip files-archive && \
    mkdir $LASER/models && \
    mv laser* $LASER/models && \
    mkdir RoLASER && \
    mv rolaser* RoLASER/ && \
    mkdir c-RoLASER && \
    mv c-rolaser* c-RoLASER/ && \
    rm files-archive

# Set Python system path to find the Fairseq and LASER modules
ENV PYTHONPATH=$PYTHONPATH:$FAIRSEQ
ENV PYTHONPATH=$PYTHONPATH:$LASER/source

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user
# Switch to the "user" user
USER user
# Set home to the user's home directory
ENV HOME=/home/user
ENV PATH=$HOME/.local/bin:$PATH

WORKDIR $HOME
RUN mkdir app
# Set the working directory to the app directory inside the user's home
WORKDIR $HOME/app
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
COPY --chown=user . $HOME/app

# NOTE(review): 7860 is exposed but no --server.port is passed to Streamlit
# below — confirm the listening port is configured elsewhere (for example a
# Streamlit config file copied in with the app).
EXPOSE 7860
# Exec form so Streamlit runs as PID 1 and receives stop signals directly.
CMD ["streamlit", "run", "app.py", \
     "--server.headless", "true", \
     "--server.enableCORS", "false", \
     "--server.enableXsrfProtection", "false", \
     "--server.fileWatcherType", "none"]
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import numpy as np
5
+ from sklearn.metrics.pairwise import paired_cosine_distances
6
+ from sklearn.preprocessing import normalize
7
+ from rolaser import RoLaserEncoder
8
+
9
@st.cache_resource
def _load_encoder(checkpoint, vocab, tokenizer):
    """Build a RoLaserEncoder once and reuse it across Streamlit reruns.

    Streamlit re-executes this script from the top on every widget
    interaction; without caching, each of the encoders below would be
    constructed again on every click.

    Args:
        checkpoint: Path to the model checkpoint file.
        vocab: Path to the vocabulary file that goes with the checkpoint.
        tokenizer: Tokenizer name passed through to RoLaserEncoder.

    Returns:
        The RoLaserEncoder built from the given arguments.
    """
    return RoLaserEncoder(model_path=checkpoint, vocab=vocab, tokenizer=tokenizer)


# Model locations come from the LASER and ROLASER environment variables; a
# missing variable raises KeyError at start-up.
laser_checkpoint = f"{os.environ['LASER']}/models/laser2.pt"
laser_vocab = f"{os.environ['LASER']}/models/laser2.cvocab"
laser_tokenizer = 'spm'
laser_model = _load_encoder(laser_checkpoint, laser_vocab, laser_tokenizer)

rolaser_checkpoint = f"{os.environ['ROLASER']}/models/RoLASER/rolaser.pt"
rolaser_vocab = f"{os.environ['ROLASER']}/models/RoLASER/rolaser.cvocab"
rolaser_tokenizer = 'roberta'
rolaser_model = _load_encoder(rolaser_checkpoint, rolaser_vocab, rolaser_tokenizer)

c_rolaser_checkpoint = f"{os.environ['ROLASER']}/models/c-RoLASER/c-rolaser.pt"
c_rolaser_vocab = f"{os.environ['ROLASER']}/models/c-RoLASER/c-rolaser.cvocab"
c_rolaser_tokenizer = 'char'
c_rolaser_model = _load_encoder(c_rolaser_checkpoint, c_rolaser_vocab, c_rolaser_tokenizer)
23
+
24
def add_text_inputs(i):
    """Render one side-by-side pair of text boxes and return what they hold.

    Args:
        i: Index of the pair; it is embedded in the initial text of each
            box (``std{i}`` on the left, ``ugc{i}`` on the right).

    Returns:
        A ``(standard_text, non_standard_text)`` tuple with the current
        contents of the left and right box.
    """
    left_column, right_column = st.columns(2)
    # Calling the widget on the column object places it in that column,
    # the same as creating it inside a `with column:` block.
    standard_text = left_column.text_input('Enter standard text here:', f'std{i}')
    non_standard_text = right_column.text_input('Enter non-standard text here:', f'ugc{i}')
    return standard_text, non_standard_text
31
+
32
def _paired_distances(model, std_texts, ugc_texts):
    """Return the cosine distance between each standard/non-standard pair.

    Both lists are encoded with ``model`` (standard texts first), the
    embeddings are normalized, and the distance is taken row by row.
    """
    std_embeddings = normalize(model.encode(std_texts))
    ugc_embeddings = normalize(model.encode(ugc_texts))
    return paired_cosine_distances(std_embeddings, ugc_embeddings)


def main():
    """Run the Streamlit page comparing LASER, RoLASER and c-RoLASER.

    The page collects pairs of standard / non-standard sentences. On
    "Submit" it encodes every pair with each of the three models, plots the
    per-pair cosine distances as a bar chart and prints each model's average
    distance.
    """
    st.title('Pairwise Cosine Distance Calculator')

    num_pairs = st.sidebar.number_input('Number of Text Input Pairs', min_value=1, max_value=10, value=1)

    std_text_inputs = []
    ugc_text_inputs = []
    for i in range(num_pairs):
        std_text, ugc_text = add_text_inputs(i)
        std_text_inputs.append(std_text)
        ugc_text_inputs.append(ugc_text)

    if st.button('Add Text Input Pair'):
        std_text, ugc_text = add_text_inputs(len(std_text_inputs))
        std_text_inputs.append(std_text)
        ugc_text_inputs.append(ugc_text)

    if st.button('Submit'):
        # Insertion order fixes both the encoding order and the row order.
        models = {
            'LASER': laser_model,
            'RoLASER': rolaser_model,
            'C-RoLASER': c_rolaser_model,
        }
        # Size everything from the lists actually collected, so the table
        # stays consistent whatever number of pairs is on the page.
        pair_count = len(std_text_inputs)
        distances = [
            _paired_distances(model, std_text_inputs, ugc_text_inputs)
            for model in models.values()
        ]

        # One row per (model, pair): each model name is repeated once per
        # pair, and the pair-level columns are tiled once per model.
        outputs = pd.DataFrame({
            'model': np.repeat(list(models), pair_count),
            'pair': np.tile(np.arange(1, pair_count + 1), len(models)),
            'ugc': np.tile(ugc_text_inputs, len(models)),
            'std': np.tile(std_text_inputs, len(models)),
            # The per-model results are 1-D, so they are joined end to end
            # along the only axis they have.
            'cos': np.concatenate(distances),
        })

        st.write('## Cosine Distance Scores:')
        # The chart takes its axis and legend captions from the column
        # names, so the columns are renamed to the wanted captions instead
        # of passing caption keyword arguments.
        chart_data = outputs.rename(columns={
            'pair': 'Text Input Pair',
            'cos': 'Cosine Distance',
            'model': 'Model',
        })
        st.bar_chart(chart_data, x='Text Input Pair', y='Cosine Distance', color='Model')

        st.write('## Average Cosine Distance Scores:')
        for model_name in models:
            average = outputs.loc[outputs['model'] == model_name, 'cos'].mean()
            st.write(f'{model_name}: {average}')


if __name__ == "__main__":
    main()