dwb2023 committed on
Commit
6c65400
1 Parent(s): ee66c5b

Update file_utils.py

Browse files
Files changed (1) hide show
  1. file_utils.py +44 -38
file_utils.py CHANGED
@@ -1,43 +1,49 @@
1
  import os
2
- from magika import Magika
3
- import datetime
4
 
5
- def validate_file_types(directory):
6
- m = Magika()
7
- file_types = {}
8
- for root, _, files in os.walk(directory):
9
- if '.git' in root:
10
- continue
11
- for file_name in files:
12
- file_path = os.path.join(root, file_name)
13
- try:
14
- with open(file_path, 'rb') as file:
15
- file_bytes = file.read()
16
- result = m.identify_bytes(file_bytes)
17
- file_types[file_path] = result.output.ct_label
18
- except Exception as e:
19
- file_types[file_path] = f"Error: {str(e)}"
20
- return file_types
21
 
22
- def get_file_summary(file_path, file_type):
23
- size = os.path.getsize(file_path)
24
- creation_date = datetime.datetime.utcfromtimestamp(os.path.getctime(file_path)).strftime('%Y-%m-%d %H:%M:%S UTC')
25
- modification_date = datetime.datetime.utcfromtimestamp(os.path.getmtime(file_path)).strftime('%Y-%m-%d %H:%M:%S UTC')
26
- return {
27
- "name": os.path.relpath(file_path),
28
- "type": file_type,
29
- "size": size,
30
- "creation_date": creation_date,
31
- "modification_date": modification_date
32
- }
33
 
34
- def read_file_content(file_path, max_size=32*1024):
35
- with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
36
- if os.path.getsize(file_path) > max_size:
37
- return file.read(max_size) + "\n... [Content Truncated] ..."
38
- else:
39
- return file.read()
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- def summarize_content(content):
42
- # Placeholder for summarization logic
43
- pass
 
1
  import os
2
+ import gradio as gr
3
+ from repo_utils import extract_repo_content
4
 
5
+ def format_output(extracted_content, repo_url):
6
+ formatted_output = f"# Repository URL: {repo_url}\n\n"
7
+ for file_data in extracted_content:
8
+ if isinstance(file_data, dict) and 'header' in file_data:
9
+ formatted_output += f"### File: {file_data['header']['name']}\n"
10
+ formatted_output += f"**Type:** {file_data['header']['type']}\n"
11
+ formatted_output += f"**Size:** {file_data['header']['size']} bytes\n"
12
+ formatted_output += f"**Created:** {file_data['header']['creation_date']}\n"
13
+ formatted_output += f"**Modified:** {file_data['header']['modification_date']}\n"
14
+ formatted_output += "#### Content:\n"
15
+ formatted_output += f"```\n{file_data['content']}\n```\n\n"
16
+ else:
17
+ formatted_output += "Error in file data format.\n"
18
+ return formatted_output
 
 
19
 
20
+ def extract_and_display(url):
21
+ hf_token = os.getenv("HF_TOKEN")
22
+ hf_user = os.getenv("SPACE_AUTHOR_NAME")
23
+ if not hf_token or not hf_user:
24
+ return "Error: HF_TOKEN or SPACE_AUTHOR_NAME environment variable is not set."
25
+ extracted_content = extract_repo_content(url, hf_token, hf_user)
26
+ formatted_output = format_output(extracted_content, url)
27
+ return formatted_output
 
 
 
28
 
29
+ app = gr.Blocks(theme="sudeepshouche/minimalist")
30
+
31
+ with app:
32
+ gr.Markdown("# Hugging Face Space / Model Repository Content Extractor")
33
+ url_input = gr.Textbox(label="https:// URL of Repository", placeholder="Enter the repository URL here OR select an example below...")
34
+ url_examples = gr.Examples(
35
+ examples=[
36
+ ["https://huggingface.co/spaces/big-vision/paligemma-hf"],
37
+ ["https://huggingface.co/google/paligemma-3b-mix-224"],
38
+ ["https://huggingface.co/microsoft/Phi-3-vision-128k-instruct"],
39
+ ["https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf"]
40
+ ],
41
+ inputs=url_input
42
+ )
43
+ output_display = gr.Textbox(label="Extracted Repository Content", show_copy_button=True, lines=20, placeholder="Repository content will be extracted here...\n\nMetadata is captured for all files, but text content provided only for files less than 32 kb\n\n\n\nReview and search through the content here OR simply copy it for offline analysis!!. 🤖")
44
+ extract_button = gr.Button("Extract Content")
45
+
46
+ extract_button.click(fn=extract_and_display, inputs=url_input, outputs=output_display)
47
 
48
+ if __name__ == "__main__":
49
+ app.launch()