dwb2023 committed on
Commit
b7bbd70
1 Parent(s): 6b4bf17

Update file_utils.py

Browse files
Files changed (1) hide show
  1. file_utils.py +34 -26
file_utils.py CHANGED
@@ -1,30 +1,38 @@
1
- import gradio as gr
2
- from repo_utils import extract_repo_content
3
- from display_utils import format_output
4
 
5
- # Extract and display function
6
- def extract_and_display(url):
7
- extracted_content = extract_repo_content(url, hf_token, hf_user)
8
- formatted_output = format_output(extracted_content, url)
9
- return formatted_output
10
 
11
- app = gr.Blocks(theme="sudeepshouche/minimalist")
 
 
 
 
 
 
 
 
12
 
13
- with app:
14
- gr.Markdown("# Hugging Face Space / Model Repository Content Extractor")
15
- url_input = gr.Textbox(label="https:// URL of Repository", placeholder="Enter the repository URL here OR select an example below...")
16
- url_examples = gr.Examples(
17
- examples=[
18
- ["https://huggingface.co/spaces/big-vision/paligemma-hf"],
19
- ["https://huggingface.co/google/paligemma-3b-mix-224"],
20
- ["https://huggingface.co/microsoft/Phi-3-vision-128k-instruct"],
21
- ["https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf"]
22
- ],
23
- inputs=url_input
24
- )
25
- output_display = gr.Textbox(label="Extracted Repository Content", show_copy_button=True, lines=20, placeholder="Repository content will be extracted here...\n\nMetadata is captured for all files, but text content provided only for files less than 32 kb\n\n\n\nReview and search through the content here OR simply copy it for offline analysis!!. 🤖")
26
- extract_button = gr.Button("Extract Content")
27
-
28
- extract_button.click(fn=extract_and_display, inputs=url_input, outputs=output_display)
29
 
30
- app.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from magika import Magika
 
3
 
4
# File-type labels treated as supported.
# NOTE(review): these look like Magika content-type labels — confirm against
# the code that consumes this list.
SUPPORTED_FILE_TYPES = [
    "txt",
    "shell",
    "python",
    "markdown",
    "yaml",
    "json",
    "csv",
    "tsv",
    "xml",
    "html",
    "ini",
    "jsonl",
    "ipynb",
]
 
 
 
 
5
 
6
def get_file_summary(file_path, file_type):
    """Build a metadata record describing a single file.

    Args:
        file_path: Path of the file to describe; it must exist.
        file_type: Type label stored unchanged under the ``"type"`` key.

    Returns:
        A dict with the keys ``name`` (the path relative to the current
        working directory), ``type``, ``size`` (in bytes),
        ``creation_date`` and ``modification_date`` (both as float
        timestamps).

    Raises:
        OSError: If the file cannot be stat'ed.
    """
    byte_count = os.path.getsize(file_path)
    summary = dict(
        name=os.path.relpath(file_path),
        type=file_type,
        size=byte_count,
    )
    # NOTE: os.path.getctime is the creation time on Windows but the last
    # metadata-change time on Unix, so "creation_date" is platform dependent.
    summary["creation_date"] = os.path.getctime(file_path)
    summary["modification_date"] = os.path.getmtime(file_path)
    return summary
15
 
16
def read_file_content(file_path, max_size=32*1024):
    """Read a file as UTF-8 text, truncating it to ``max_size`` characters.

    Undecodable bytes are dropped (``errors="ignore"``) rather than raising.

    Args:
        file_path: Path of the file to read.
        max_size: Maximum number of characters to return before the
            truncation marker is appended.

    Returns:
        The whole file content when it fits within ``max_size`` characters;
        otherwise the first ``max_size`` characters followed by
        ``"\\n... [Content Truncated] ..."``.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
        content = file.read(max_size)
        # Decide truncation by probing for one more character instead of
        # comparing the on-disk byte size with a character count: multi-byte
        # or CRLF files can exceed max_size bytes yet be read completely,
        # and must not be labelled as truncated.
        if file.read(1):
            content += "\n... [Content Truncated] ..."
    return content
 
 
 
 
 
 
 
 
 
 
22
 
23
def validate_file_types(directory):
    """Identify the content type of every file under ``directory``.

    Walks the tree rooted at ``directory``, skipping any directory named
    exactly ``.git``, and asks Magika to classify each file from its bytes.

    Args:
        directory: Root directory to scan.

    Returns:
        A dict mapping each file path (as joined from the walk root) to the
        label Magika reports for it, or to an ``"Error: ..."`` string when
        the file could not be read or identified.
    """
    m = Magika()
    file_types = {}
    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk never descends into a Git metadata
        # directory. Matching the exact name avoids wrongly skipping paths
        # that merely contain ".git", such as ".github" or "repo.git".
        dirs[:] = [d for d in dirs if d != ".git"]
        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                with open(file_path, 'rb') as file:
                    file_bytes = file.read()
                result = m.identify_bytes(file_bytes)
                file_types[file_path] = result.output.ct_label
            except Exception as e:
                # Best effort: record the failure for this file and keep
                # scanning the rest of the tree.
                file_types[file_path] = f"Error: {str(e)}"
    return file_types