Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files
- Dockerfile +85 -0
- app.py +78 -0
Dockerfile
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Use a base image with Conda installed.
# A single FROM is enough: every FROM starts a new build stage and only the
# last stage ends up in the final image, so additional base images declared
# before this one would be pulled and then thrown away.
FROM continuumio/miniconda3:latest

WORKDIR /app
# Copies every file whose name ends in "requirements.txt"; the install steps
# below read FAIRSEQ_requirements.txt, LASER_requirements.txt and
# ROLASER_requirements.txt from /app.
COPY *requirements.txt /app/

# Install the build tools, then clone the RoLASER repository into the container
RUN apt-get update && \
    apt-get install -y g++ unzip git && \
    git clone https://github.com/lydianish/RoLASER.git /app/RoLASER

# Create conda environment with the required Python and GCC versions
WORKDIR /app/RoLASER
COPY environment.yml /app/RoLASER/environment.yml
RUN conda env create -f environment.yml
RUN echo "conda activate rolaser_env" >> ~/.bashrc
# Put the environment's bin directory first so that the python3/pip3 calls
# in later RUN steps resolve inside rolaser_env.
ENV PATH=/opt/conda/envs/rolaser_env/bin:$PATH
ENV ROLASER=/app/RoLASER

# Install PyTorch 1.10.1
RUN pip3 install --no-cache-dir torch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 -f https://download.pytorch.org/whl/cu102/torch_stable.html

# Install Fairseq and dependencies
RUN git clone https://github.com/lydianish/fairseq.git /app/fairseq
WORKDIR /app/fairseq
RUN git checkout rolaser
RUN pip3 install --no-cache-dir --editable .
RUN python3 setup.py build_ext --inplace
RUN pip3 install --no-cache-dir -r /app/FAIRSEQ_requirements.txt
ENV FAIRSEQ=/app/fairseq

# Install LASER and dependencies
RUN git clone https://github.com/lydianish/LASER.git /app/LASER
WORKDIR /app/LASER
RUN git checkout rolaser
ENV LASER=/app/LASER
RUN bash ./install_external_tools.sh
RUN pip3 install --no-cache-dir -r /app/LASER_requirements.txt

# Install other RoLASER dependencies
WORKDIR /app/RoLASER
RUN pip3 install --no-cache-dir -r /app/ROLASER_requirements.txt

# Download models
WORKDIR /app/RoLASER/models
RUN wget https://zenodo.org/api/records/10864557/files-archive
RUN unzip files-archive

# Sort the extracted files by name prefix: laser* goes to $LASER/models,
# rolaser* and c-rolaser* go to sub-directories of the current directory.
RUN mkdir $LASER/models \
    && mv laser* $LASER/models \
    && mkdir RoLASER \
    && mv rolaser* RoLASER/ \
    && mkdir c-RoLASER \
    && mv c-rolaser* c-RoLASER/ \
    && rm files-archive

# Set Python system path to find the Fairseq and LASER modules
ENV PYTHONPATH=$PYTHONPATH:$FAIRSEQ
ENV PYTHONPATH=$PYTHONPATH:$LASER/source

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user
# Switch to the "user" user
USER user
# Set home to the user's home directory
ENV HOME=/home/user
ENV PATH=$HOME/.local/bin:$PATH

WORKDIR $HOME
RUN mkdir app
# Set the working directory to the app directory inside the user's home
WORKDIR $HOME/app
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
COPY --chown=user . $HOME/app

EXPOSE 7860
# NOTE(review): no --server.port is passed here, so Streamlit uses its own
# default port unless a config file copied above sets it; confirm that the
# app really listens on the exposed port 7860.
CMD streamlit run app.py \
    --server.headless true \
    --server.enableCORS false \
    --server.enableXsrfProtection false \
    --server.fileWatcherType none
|
app.py
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os, sys
|
2 |
+
import streamlit as st
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
from sklearn.metrics.pairwise import paired_cosine_distances
|
6 |
+
from sklearn.preprocessing import normalize
|
7 |
+
from rolaser import RoLaserEncoder
|
8 |
+
|
9 |
+
@st.cache_resource
def _load_encoder(model_path, vocab, tokenizer):
    """Build a ``RoLaserEncoder`` once and reuse it across Streamlit reruns.

    Streamlit re-executes this script from the top on every widget
    interaction. Without caching, each rerun would construct all three
    encoders again from their checkpoint files.

    Args:
        model_path: Path of the encoder checkpoint file.
        vocab: Path of the vocabulary file that goes with the checkpoint.
        tokenizer: Tokenizer name passed through to ``RoLaserEncoder``.

    Returns:
        The encoder instance; the same object is returned for identical
        arguments on later calls.
    """
    return RoLaserEncoder(model_path=model_path, vocab=vocab, tokenizer=tokenizer)


# LASER encoder: files live under $LASER/models, tokenizer 'spm'.
laser_checkpoint = f"{os.environ['LASER']}/models/laser2.pt"
laser_vocab = f"{os.environ['LASER']}/models/laser2.cvocab"
laser_tokenizer = 'spm'
laser_model = _load_encoder(laser_checkpoint, laser_vocab, laser_tokenizer)

# RoLASER encoder: files live under $ROLASER/models/RoLASER, tokenizer 'roberta'.
rolaser_checkpoint = f"{os.environ['ROLASER']}/models/RoLASER/rolaser.pt"
rolaser_vocab = f"{os.environ['ROLASER']}/models/RoLASER/rolaser.cvocab"
rolaser_tokenizer = 'roberta'
rolaser_model = _load_encoder(rolaser_checkpoint, rolaser_vocab, rolaser_tokenizer)

# c-RoLASER encoder: files live under $ROLASER/models/c-RoLASER, tokenizer 'char'.
c_rolaser_checkpoint = f"{os.environ['ROLASER']}/models/c-RoLASER/c-rolaser.pt"
c_rolaser_vocab = f"{os.environ['ROLASER']}/models/c-RoLASER/c-rolaser.cvocab"
c_rolaser_tokenizer = 'char'
c_rolaser_model = _load_encoder(c_rolaser_checkpoint, c_rolaser_vocab, c_rolaser_tokenizer)
|
23 |
+
|
24 |
+
def add_text_inputs(i):
    """Render one row of two side-by-side text boxes and return their values.

    Args:
        i: Index of the pair; it is embedded in the initial text of both
            boxes (``std{i}`` on the left, ``ugc{i}`` on the right).

    Returns:
        A ``(standard_text, non_standard_text)`` tuple holding the current
        contents of the left and right box.
    """
    left_column, right_column = st.columns(2)
    # Calling the widget on the column object places it in that column,
    # exactly like entering the column as a context manager.
    standard_text = left_column.text_input('Enter standard text here:', f'std{i}')
    non_standard_text = right_column.text_input('Enter non-standard text here:', f'ugc{i}')
    return standard_text, non_standard_text
|
31 |
+
|
32 |
+
def _paired_distances(model, std_texts, ugc_texts):
    """Return the cosine distance of every standard/non-standard text pair.

    Both lists are encoded with ``model.encode`` and passed through
    ``normalize``; element ``k`` of the result compares ``std_texts[k]``
    with ``ugc_texts[k]``.
    """
    std_embeddings = normalize(model.encode(std_texts))
    ugc_embeddings = normalize(model.encode(ugc_texts))
    return paired_cosine_distances(std_embeddings, ugc_embeddings)


def _build_outputs(std_texts, ugc_texts, distances_by_model):
    """Assemble the long-format result table, one row per (model, pair).

    Args:
        std_texts: Standard texts, one per pair.
        ugc_texts: Non-standard texts, one per pair.
        distances_by_model: Mapping of model name to a 1-D array of
            distances, one per pair; insertion order fixes the row order.

    Returns:
        A DataFrame with columns ``model``, ``pair``, ``ugc``, ``std`` and
        ``cos`` holding ``len(distances_by_model) * len(std_texts)`` rows.
    """
    pair_count = len(std_texts)
    model_count = len(distances_by_model)
    return pd.DataFrame({
        # Each model name is repeated once per pair so that every column has
        # the same length whatever the number of pairs is.
        'model': np.repeat(list(distances_by_model), pair_count),
        'pair': np.tile(np.arange(1, pair_count + 1), model_count),
        'ugc': np.tile(ugc_texts, model_count),
        'std': np.tile(std_texts, model_count),
        # The per-model distance arrays are 1-D, so they are joined end to
        # end along the only axis they have.
        'cos': np.concatenate(list(distances_by_model.values())),
    })


def main():
    """Render the page: collect text pairs and chart their cosine distances.

    The sidebar number input sets how many pairs of text boxes are shown.
    Pressing "Submit" encodes every pair with the LASER, RoLASER and
    c-RoLASER encoders, then displays a bar chart of the per-pair cosine
    distances followed by the mean distance for each model.
    """
    st.title('Pairwise Cosine Distance Calculator')

    num_pairs = st.sidebar.number_input('Number of Text Input Pairs', min_value=1, max_value=10, value=1)

    std_text_inputs = []
    ugc_text_inputs = []
    for i in range(num_pairs):
        std_text, ugc_text = add_text_inputs(i)
        std_text_inputs.append(std_text)
        ugc_text_inputs.append(ugc_text)

    # NOTE(review): st.button is only truthy during the run triggered by the
    # click, so a pair added here presumably disappears on the next
    # interaction (e.g. pressing Submit) - confirm whether it should persist.
    if st.button('Add Text Input Pair'):
        std_text, ugc_text = add_text_inputs(len(std_text_inputs))
        std_text_inputs.append(std_text)
        ugc_text_inputs.append(ugc_text)

    if st.button('Submit'):
        distances_by_model = {
            'LASER': _paired_distances(laser_model, std_text_inputs, ugc_text_inputs),
            'RoLASER': _paired_distances(rolaser_model, std_text_inputs, ugc_text_inputs),
            'C-RoLASER': _paired_distances(c_rolaser_model, std_text_inputs, ugc_text_inputs),
        }
        outputs = _build_outputs(std_text_inputs, ugc_text_inputs, distances_by_model)

        st.write('## Cosine Distance Scores:')
        # Only arguments that st.bar_chart accepts are passed; the heading
        # written just above serves as the chart title.
        st.bar_chart(outputs, x='pair', y='cos', color='model')

        st.write('## Average Cosine Distance Scores:')
        for model_name in distances_by_model:
            mean_distance = outputs.loc[outputs['model'] == model_name, 'cos'].mean()
            st.write(f'{model_name}: {mean_distance}')


if __name__ == "__main__":
    main()
|