dwb2023 committed on
Commit
86f4186
1 Parent(s): 72394b9

Update file_utils.py

Browse files
Files changed (1) hide show
  1. file_utils.py +49 -0
file_utils.py CHANGED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from magika import Magika
3
+ from transformers import pipeline
4
+
5
# Summarization pipeline used by summarize_content(). It is constructed once,
# at module import time, with the library's default summarization model.
summarizer = pipeline("summarization")

# File-type labels regarded as supported.
# NOTE(review): presumably compared against the labels produced by
# validate_file_types() -- confirm against callers.
SUPPORTED_FILE_TYPES = [
    "txt",
    "shell",
    "python",
    "markdown",
    "yaml",
    "json",
    "csv",
    "tsv",
    "xml",
    "html",
    "ini",
    "jsonl",
    "ipynb",
]
9
+
10
def validate_file_types(directory):
    """Identify the content type of every file under *directory* with Magika.

    Directories named exactly ``.git`` are pruned from the walk, so Git
    metadata is never scanned. Look-alike names such as ``.github`` are
    scanned, and a ``directory`` whose own path merely contains ".git"
    (for example a clone called ``repo.git``) is still processed.

    Args:
        directory: Root directory to walk recursively.

    Returns:
        A dict mapping each file path (the walk root joined with the file
        name) to the content-type label reported by Magika, or to an
        ``"Error: ..."`` string when the file could not be read or
        identified.
    """
    m = Magika()
    file_types = {}
    for root, dirs, files in os.walk(directory):
        # Prune in place: os.walk (top-down) only descends into the entries
        # left in ``dirs``. Matching the directory name exactly avoids the
        # false positives of a substring test on the full path.
        dirs[:] = [d for d in dirs if d != '.git']
        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                with open(file_path, 'rb') as file:
                    file_bytes = file.read()
                result = m.identify_bytes(file_bytes)
                file_types[file_path] = result.output.ct_label
            except Exception as e:
                # Deliberate best-effort: one unreadable or unidentifiable
                # file is recorded as an error and the scan continues.
                file_types[file_path] = f"Error: {e}"
    return file_types
26
+
27
def get_file_summary(file_path, file_type):
    """Build a metadata record for a single file.

    Args:
        file_path: Path to an existing file.
        file_type: Label stored unchanged under the ``"type"`` key.

    Returns:
        A dict with the keys ``"name"`` (the path relative to the current
        working directory), ``"type"``, ``"size"`` (bytes),
        ``"creation_date"`` and ``"modification_date"`` (both timestamps in
        seconds since the epoch).

    Raises:
        OSError: If the file cannot be stat'ed (e.g. it does not exist).
    """
    # A single stat call yields a consistent snapshot of size and timestamps
    # instead of three separate filesystem lookups.
    info = os.stat(file_path)
    return {
        "name": os.path.relpath(file_path),
        "type": file_type,
        "size": info.st_size,
        # st_ctime is the creation time on Windows but the time of the last
        # metadata change on Unix; the key name is kept for compatibility.
        "creation_date": info.st_ctime,
        "modification_date": info.st_mtime,
    }
36
+
37
def read_file_content(file_path, max_size=32*1024):
    """Read a text file, truncating the result to *max_size* characters.

    The file is decoded as UTF-8 and undecodable bytes are dropped. At most
    ``max_size + 1`` characters are read, so a very large file is never
    loaded into memory in full.

    Args:
        file_path: Path of the file to read.
        max_size: Maximum number of characters (not bytes) to return before
            the truncation marker is appended.

    Returns:
        The file content, or its first ``max_size`` characters followed by
        ``"\n... [Content Truncated] ..."`` when the file is longer.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # One character beyond the cap is enough to tell whether truncation is
    # needed. A negative cap keeps the historical slice semantics, which
    # require the full content, so fall back to an unbounded read.
    limit = max_size + 1 if max_size >= 0 else -1
    with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
        content = file.read(limit)
    if len(content) > max_size:
        return content[:max_size] + "\n... [Content Truncated] ..."
    return content
44
+
45
def summarize_content(content, max_chunk_size=1000):
    """Summarize *content* chunk by chunk and join the partial summaries.

    The text is cut into consecutive slices of at most ``max_chunk_size``
    characters, each slice is passed to the module-level ``summarizer``, and
    the resulting ``summary_text`` values are joined with single spaces.

    Args:
        content: Text to summarize.
        max_chunk_size: Maximum number of characters per chunk. This is a
            character count, not a model token count.
            NOTE(review): 1000 characters is presumably a proxy for the
            model's input limit -- confirm for the model in use.

    Returns:
        The space-joined chunk summaries; an empty string when *content* is
        empty (the summarizer is not called in that case).

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")
    chunks = (
        content[start:start + max_chunk_size]
        for start in range(0, len(content), max_chunk_size)
    )
    return " ".join(summarizer(chunk)[0]['summary_text'] for chunk in chunks)