dwb2023 committed on
Commit
c881cad
1 Parent(s): 56dbc6b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -108
app.py CHANGED
@@ -1,113 +1,8 @@
1
- import os
2
- import subprocess
3
  import gradio as gr
4
- from magika import Magika
5
- from huggingface_hub import login
6
-
7
- # Get the HF token and space author name from environment variables
8
- hf_token = os.getenv("HF_TOKEN")
9
- hf_user = os.getenv("SPACE_AUTHOR_NAME")
10
-
11
- if not hf_token:
12
- raise ValueError("HF_TOKEN environment variable is not set")
13
- if not hf_user:
14
- raise ValueError("SPACE_AUTHOR_NAME environment variable is not set")
15
-
16
- SUPPORTED_FILE_TYPES = ["txt", "shell", "python", "markdown", "yaml", "json", "csv", "tsv", "xml", "html", "ini", "jsonl", "ipynb"]
17
-
18
- def validate_url(url):
19
- return url.startswith('https://')
20
-
21
- def clone_repo(url, repo_dir, hf_token, hf_user):
22
- env = os.environ.copy()
23
- env['GIT_LFS_SKIP_SMUDGE'] = '1'
24
- token_url = url.replace('https://', f'https://{hf_user}:{hf_token}@')
25
- result = subprocess.run(["git", "clone", token_url, repo_dir], env=env, capture_output=True, text=True)
26
- if result.returncode != 0:
27
- return False, result.stderr
28
- return True, None
29
-
30
- def get_file_summary(file_path, file_type):
31
- size = os.path.getsize(file_path)
32
- return {
33
- "name": os.path.relpath(file_path),
34
- "type": file_type,
35
- "size": size,
36
- "creation_date": os.path.getctime(file_path),
37
- "modification_date": os.path.getmtime(file_path)
38
- }
39
-
40
- def read_file_content(file_path, max_size=32*1024):
41
- with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
42
- if os.path.getsize(file_path) > max_size:
43
- return file.read(max_size) + "\n... [Content Truncated] ..."
44
- else:
45
- return file.read()
46
-
47
- def validate_file_types(directory):
48
- m = Magika()
49
- file_types = {}
50
- for root, _, files in os.walk(directory):
51
- if '.git' in root:
52
- continue
53
- for file_name in files:
54
- file_path = os.path.join(root, file_name)
55
- try:
56
- with open(file_path, 'rb') as file:
57
- file_bytes = file.read()
58
- result = m.identify_bytes(file_bytes)
59
- file_types[file_path] = result.output.ct_label
60
- except Exception as e:
61
- file_types[file_path] = f"Error: {str(e)}"
62
- return file_types
63
-
64
- def extract_repo_content(url, hf_token, hf_user):
65
- if not validate_url(url):
66
- return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": "Invalid URL"}]
67
-
68
- repo_dir = "./temp_repo"
69
- if os.path.exists(repo_dir):
70
- subprocess.run(["rm", "-rf", repo_dir])
71
-
72
- success, error = clone_repo(url, repo_dir, hf_token, hf_user)
73
- if not success:
74
- return [{"header": {"name": "Error", "type": "error", "size": 0}, "content": f"Failed to clone repository: {error}"}]
75
-
76
- file_types = validate_file_types(repo_dir)
77
- extracted_content = []
78
- for file_path, file_type in file_types.items():
79
- file_summary = get_file_summary(file_path, file_type)
80
- content = {"header": file_summary}
81
-
82
- if file_type in SUPPORTED_FILE_TYPES and file_summary["size"] <= 32 * 1024:
83
- try:
84
- content["content"] = read_file_content(file_path)
85
- except Exception as e:
86
- content["content"] = f"Failed to read file content: {str(e)}"
87
- else:
88
- content["content"] = "File too large or binary, content not captured."
89
-
90
- extracted_content.append(content)
91
-
92
- subprocess.run(["rm", "-rf", repo_dir])
93
-
94
- return extracted_content
95
-
96
- def format_output(extracted_content, repo_url):
97
- formatted_output = f"# Repository URL: {repo_url}\n\n"
98
- for file_data in extracted_content:
99
- if isinstance(file_data, dict) and 'header' in file_data:
100
- formatted_output += f"### File: {file_data['header']['name']}\n"
101
- formatted_output += f"**Type:** {file_data['header']['type']}\n"
102
- formatted_output += f"**Size:** {file_data['header']['size']} bytes\n"
103
- formatted_output += f"**Created:** {file_data['header']['creation_date']}\n"
104
- formatted_output += f"**Modified:** {file_data['header']['modification_date']}\n"
105
- formatted_output += "#### Content:\n"
106
- formatted_output += f"```\n{file_data['content']}\n```\n\n"
107
- else:
108
- formatted_output += "Error in file data format.\n"
109
- return formatted_output
110
 
 
111
  def extract_and_display(url):
112
  extracted_content = extract_repo_content(url, hf_token, hf_user)
113
  formatted_output = format_output(extracted_content, url)
 
 
 
1
  import gradio as gr
2
+ from repo_utils import extract_repo_content
3
+ from display_utils import format_output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ # Extract and display function
6
  def extract_and_display(url):
7
  extracted_content = extract_repo_content(url, hf_token, hf_user)
8
  formatted_output = format_output(extracted_content, url)