narugo1992 committed
Commit 5ff92c3 (0 parents)

Duplicate from narugo/jupyterlab_crawl_nikke

Files changed (8)
  1. .bashrc +1 -0
  2. .gitattributes +34 -0
  3. .gitignore +1 -0
  4. Dockerfile +41 -0
  5. README.md +13 -0
  6. requirements.txt +3 -0
  7. run.sh +15 -0
  8. test_crawl.py +66 -0
.bashrc ADDED
@@ -0,0 +1 @@
+ export PATH=$HOME/.local/bin:$PATH
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+ venv
Dockerfile ADDED
@@ -0,0 +1,41 @@
+ FROM python:3.8.1
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN apt-get update && \
+     apt-get install -y sudo tmux wget curl htop make tree nano && \
+     apt-get install -y iputils-ping telnet && \
+     apt-get install -y ffmpeg libsm6 libxext6 && \
+     apt-get install -y git git-lfs
+
+ RUN --mount=type=secret,id=PASSWORD,mode=0444,required=true \
+     useradd -m -u 1000 user && \
+     echo "user:$(cat /run/secrets/PASSWORD)" | chpasswd && \
+     adduser user sudo
+
+ RUN pip install -U pip pysocks
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ USER user
+ ENV HOME=/home/user
+ ENV PATH=$HOME/.local/bin:$PATH
+ ENV SHELL=/bin/bash
+
+ WORKDIR $HOME
+
+ COPY --chown=user . $HOME/app
+
+ COPY .bashrc $HOME/.bashrc_append
+ RUN cat $HOME/.bashrc_append >> $HOME/.bashrc && \
+     rm $HOME/.bashrc_append
+
+ RUN git clone https://github.com/deepghs/cyberharem.git $HOME/cyberharem && \
+     pip install -r $HOME/cyberharem/requirements.txt && \
+     pip install onnxruntime
+ RUN cp $HOME/app/test_crawl.py $HOME/cyberharem
+
+ EXPOSE 7860
+ ENTRYPOINT []
+ CMD ["/bin/bash", "./app/run.sh"]
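The RUN --mount=type=secret step needs a BuildKit secret named PASSWORD at build time; on Hugging Face Spaces this is injected from the Space's secret settings. For a local build, a minimal sketch (password.txt is a hypothetical file you create yourself):

    # Enable BuildKit so the secret mount is available, then run the image.
    DOCKER_BUILDKIT=1 docker build \
        --secret id=PASSWORD,src=password.txt \
        -t jupyterlab-crawl .
    docker run -p 7860:7860 -e PASSWORD=huggingface jupyterlab-crawl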
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: JupyterLab
+ emoji: 💹
+ colorFrom: blue
+ colorTo: red
+ sdk: docker
+ pinned: false
+ license: mit
+ app_port: 7860
+ duplicated_from: narugo/jupyterlab_crawl_nikke
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ jupyterlab==3.6.1
+ jupyter-server==2.3.0
+ tornado==6.2
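To mirror the Space's Jupyter stack outside Docker, the pinned versions can be installed directly; a quick sketch:

    # Install the pinned Jupyter stack and confirm the version.
    pip install -r requirements.txt
    jupyter lab --version   # expect 3.6.1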
run.sh ADDED
@@ -0,0 +1,15 @@
+ #!/bin/bash
+
+ PASSWORD="${PASSWORD:=huggingface}"
+ echo "Starting Jupyter Lab with token $PASSWORD"
+
+ jupyter lab \
+     --ip=0.0.0.0 \
+     --port=7860 \
+     --no-browser \
+     --allow-root \
+     --NotebookApp.token="$PASSWORD" \
+     --NotebookApp.tornado_settings="{'headers': {'Content-Security-Policy': 'frame-ancestors *'}}" \
+     --NotebookApp.cookie_options="{'SameSite': 'None', 'Secure': True}" \
+     --NotebookApp.disable_check_xsrf=True
+
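With the container running, the token set above gates access to the lab. A quick health-check sketch, assuming the port is published locally and PASSWORD kept its default of huggingface:

    # Probe the Jupyter Lab endpoint using the default token.
    curl -sf "http://localhost:7860/lab?token=huggingface" > /dev/null \
        && echo "Jupyter Lab is up"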
test_crawl.py ADDED
@@ -0,0 +1,69 @@
+ import re
+ from concurrent.futures import ThreadPoolExecutor
+
+ from ditk import logging
+ from gchar.games.azurlane import Character
+ from gchar.resources.pixiv import get_pixiv_posts
+ from tqdm.auto import tqdm
+ from waifuc.utils import task_ctx
+
+ from cyberharem.dataset import crawl_dataset_to_huggingface
+ from cyberharem.utils import get_hf_fs
+
+
+ def _cht(ch: Character):
+     # Build a repository-safe name of the form '<short_name>_<game>'.
+     name = str(ch.enname or ch.cnname or ch.jpname)
+     short_name = re.sub(r'[\W_]+', '_', name).strip('_')
+     return f'{short_name}_{ch.__game_name__}'
+
+
+ def _get_pixiv_posts(ch: Character):
+     # Popularity proxy: pixiv post count for this character (0 if unknown).
+     v = get_pixiv_posts(ch)
+     return 0 if v is None else v[0]
+
+
+ hf_fs = get_hf_fs()
+ # all_chs = [ch for ch in Character.all(contains_extra=False) if not ch.is_extra and ch.accessible and ch.index >= 153]
+ all_chs = Character.all(contains_extra=False)
+ # Sort by pixiv popularity, then take every third character starting at
+ # offset 2 -- presumably one shard of a three-way split across instances.
+ all_chs = sorted(all_chs, key=lambda x: -_get_pixiv_posts(x))[2::3]
+ pg = tqdm(total=len(all_chs))
+ crawled_ids = set()
+
+
+ def _crawl(char_):
+     try:
+         repo = f'CyberHarem/{_cht(char_)}'
+         if char_.gender == 'female':
+             if char_.index not in crawled_ids and \
+                     not hf_fs.exists(f'datasets/{repo}/dataset-raw.zip'):
+                 with task_ctx(repo):
+                     crawl_dataset_to_huggingface(
+                         char_, repository=repo, limit=200
+                     )
+                 crawled_ids.add(char_.index)
+                 pg.update()
+             else:
+                 logging.info(f'{char_!r} already crawled, skipped.')
+                 pg.update()
+         else:
+             logging.info(f'{char_!r} is not female, skipped!')
+             pg.update()
+     except Exception as err:
+         logging.error(repr(err))
+         raise
+
+
+ if __name__ == '__main__':
+     logging.try_init_root(logging.INFO)
+     # max_workers=1: crawl sequentially, but off the main thread.
+     tp = ThreadPoolExecutor(max_workers=1)
+
+     for ch in all_chs:
+         # _crawl(ch)
+         tp.submit(_crawl, ch)
+
+     tp.shutdown()
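Since the Dockerfile copies this script into $HOME/cyberharem and installs tmux, a long crawl can be launched detached so it survives a closed browser tab. A sketch, assuming the image's default layout ('crawl' is just a hypothetical session name):

    # Start the crawler in a detached tmux session inside the container.
    cd "$HOME/cyberharem"
    tmux new-session -d -s crawl 'python test_crawl.py'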