silvanocerza committed on
Commit
0a4996a
1 Parent(s): 9c76d9f

Go back to fetching zip

Browse files
Files changed (2) hide show
  1. main.py +50 -19
  2. requirements.txt +1 -0
main.py CHANGED
@@ -1,7 +1,8 @@
1
  from typing import List, Tuple
2
  from pathlib import Path
3
- import subprocess
4
  import os
 
 
5
 
6
  from dotenv import load_dotenv
7
  from haystack.preview import Pipeline
@@ -18,45 +19,75 @@ from haystack.preview.components.writers import DocumentWriter
18
  from haystack.preview.components.file_converters import TextFileToDocument
19
  from haystack.preview.document_stores.memory import MemoryDocumentStore
20
  import streamlit as st
 
21
 
22
  # Load the environment variables, we're going to need it for OpenAI
23
  load_dotenv()
24
 
25
  # This is the list of documentation that we're going to fetch
26
  DOCUMENTATIONS = [
27
- ("DocArray", "https://github.com/docarray/docarray", "./docs/**/*.md"),
28
- ("Streamlit", "https://github.com/streamlit/docs", "./content/**/*.md"),
29
- ("Jinja", "https://github.com/pallets/jinja", "./docs/**/*.rst"),
30
- ("Pandas", "https://github.com/pandas-dev/pandas", "./doc/source/**/*.rst"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  (
32
  "Elasticsearch",
33
  "https://github.com/elastic/elasticsearch",
 
34
  "./docs/**/*.asciidoc",
35
  ),
36
- ("NumPy", "https://github.com/numpy/numpy", "./doc/**/*.rst"),
 
 
 
 
 
37
  ]
38
 
39
 
40
  @st.cache_data(show_spinner=False)
41
  def fetch(documentations: List[Tuple[str, str, str]]):
42
  files = []
43
- for name, url, pattern in documentations:
 
44
  st.write(f"Fetching {name} repository")
45
- repo = Path(__file__).parent / "downloaded_docs" / name
46
- if not repo.exists():
47
- subprocess.run(["git", "clone", "--depth", "1", url, str(repo)], check=True)
48
- res = subprocess.run(
49
- ["git", "rev-parse", "--abbrev-ref", "HEAD"],
50
- check=True,
51
- capture_output=True,
52
- encoding="utf-8",
53
- )
54
- branch = res.stdout.strip()
55
- for p in repo.glob(pattern):
 
 
56
  data = {
57
  "path": p,
58
  "metadata": {
59
- "url_source": f"{url}/tree/{branch}/{p.relative_to(repo)}",
60
  "suffix": p.suffix,
61
  },
62
  }
 
1
  from typing import List, Tuple
2
  from pathlib import Path
 
3
  import os
4
+ import zipfile
5
+ import io
6
 
7
  from dotenv import load_dotenv
8
  from haystack.preview import Pipeline
 
19
  from haystack.preview.components.file_converters import TextFileToDocument
20
  from haystack.preview.document_stores.memory import MemoryDocumentStore
21
  import streamlit as st
22
+ import requests
23
 
24
  # Load the environment variables, we're going to need it for OpenAI
25
  load_dotenv()
26
 
27
  # This is the list of documentation that we're going to fetch
28
  DOCUMENTATIONS = [
29
+ (
30
+ "DocArray",
31
+ "https://github.com/docarray/docarray",
32
+ "/archive/refs/heads/main.zip",
33
+ "./docs/**/*.md",
34
+ ),
35
+ (
36
+ "Streamlit",
37
+ "https://github.com/streamlit/docs",
38
+ "/archive/refs/heads/main.zip",
39
+ "./content/**/*.md",
40
+ ),
41
+ (
42
+ "Jinja",
43
+ "https://github.com/pallets/jinja",
44
+ "/archive/refs/heads/main.zip",
45
+ "./docs/**/*.rst",
46
+ ),
47
+ (
48
+ "Pandas",
49
+ "https://github.com/pandas-dev/pandas",
50
+ "/archive/refs/heads/main.zip",
51
+ "./doc/source/**/*.rst",
52
+ ),
53
  (
54
  "Elasticsearch",
55
  "https://github.com/elastic/elasticsearch",
56
+ "/archive/refs/heads/main.zip",
57
  "./docs/**/*.asciidoc",
58
  ),
59
+ (
60
+ "NumPy",
61
+ "https://github.com/numpy/numpy",
62
+ "/archive/refs/heads/main.zip",
63
+ "./doc/**/*.rst",
64
+ ),
65
  ]
66
 
67
 
68
  @st.cache_data(show_spinner=False)
69
  def fetch(documentations: List[Tuple[str, str, str]]):
70
  files = []
71
+ docs_path = Path(__file__).parent / "downloaded_docs"
72
+ for name, url, zip_path, pattern in documentations:
73
  st.write(f"Fetching {name} repository")
74
+ # All projects use `main` as the default branch
75
+ branch = "main"
76
+ # The name of the folder depends on the name of the repository
77
+ # on GitHub plus the branch zip we're downloading
78
+ repo_folder = docs_path / (url.split("/")[-1] + f"-{branch}")
79
+ if not repo_folder.exists():
80
+ res = requests.get(f"{url}{zip_path}", stream=True)
81
+ zip = zipfile.ZipFile(io.BytesIO(res.content))
82
+ # The zip file contains a folder with the name of the repository
83
+ # so we extract directly into the docs folder
84
+ zip.extractall(docs_path)
85
+
86
+ for p in repo_folder.glob(pattern):
87
  data = {
88
  "path": p,
89
  "metadata": {
90
+ "url_source": f"{url}/tree/{branch}/{p.relative_to(repo_folder)}",
91
  "suffix": p.suffix,
92
  },
93
  }
requirements.txt CHANGED
@@ -4,3 +4,4 @@ langdetect
4
  streamlit==1.27.2
5
  python-dotenv
6
  watchdog
 
 
4
  streamlit==1.27.2
5
  python-dotenv
6
  watchdog
7
+ requests