ajsbsd committed on
Commit
f52daa3
·
1 Parent(s): be81ee8

qwen.ai helper

Browse files
Files changed (8) hide show
  1. app.py +0 -0
  2. breakupText.py +42 -0
  3. chunkedCSVOutput.py +46 -0
  4. csvReader.py +15 -0
  5. dataSet.py +0 -0
  6. loadDataset.py +28 -0
  7. testToTraining.py +39 -0
  8. trainingText.py +75 -0
app.py CHANGED
File without changes
breakupText.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # text_to_training_csv.py
2
+
3
+ import sys
4
+ import csv
5
+
6
def main():
    """Split stdin into fixed-size character chunks and write them as CSV.

    The chunk size (in characters) is taken from ``sys.argv[1]``. An
    ``id,text`` header is written to stdout, followed by one row per chunk
    with ids counting up from 1. The last row may be shorter than the chunk
    size when the input length is not an exact multiple of it.

    Exits with status 1, after printing a message to stderr, when the
    argument is missing, is not an integer, or is not a positive integer.
    """
    if len(sys.argv) < 2:
        print("Usage: python text_to_training_csv.py <chunk_size>", file=sys.stderr)
        sys.exit(1)

    try:
        chunk_size = int(sys.argv[1])
    except ValueError:
        print("Error: Chunk size must be an integer.", file=sys.stderr)
        sys.exit(1)

    # read(0) returns '' immediately (only the header would be written and the
    # input silently dropped), and a negative size makes read() consume the
    # whole input and the slices below cut it relative to the end. Reject both.
    if chunk_size <= 0:
        print("Error: Chunk size must be a positive integer.", file=sys.stderr)
        sys.exit(1)

    # CSV writer setup
    writer = csv.writer(sys.stdout)
    writer.writerow(["id", "text"])  # Header row

    id_counter = 1
    buffer = ''

    while True:
        chunk = sys.stdin.read(chunk_size)
        if not chunk:
            break
        buffer += chunk

        # Emit a row as soon as a full chunk is buffered; keep the remainder.
        if len(buffer) >= chunk_size:
            writer.writerow([id_counter, buffer[:chunk_size]])
            id_counter += 1
            buffer = buffer[chunk_size:]

    # Write any leftover text shorter than one full chunk.
    if buffer:
        writer.writerow([id_counter, buffer])


if __name__ == "__main__":
    main()
chunkedCSVOutput.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # text_to_csv_chunks.py
2
+
3
+ import sys
4
+ import csv
5
+
6
def main():
    """Group stdin lines into fixed-size batches and write them as CSV.

    The batch size (number of lines per row) is taken from ``sys.argv[1]``.
    A ``Chunk,Text`` header is written to stdout, followed by one row per
    batch: the batch number (starting at 1) and the batch's lines joined
    with ``\\n``, each line having its trailing newline removed first. The
    last row may hold fewer lines than the batch size.

    Exits with status 1, after printing a message to stderr, when the
    argument is missing, is not an integer, or is not a positive integer.
    """
    if len(sys.argv) < 2:
        print("Usage: python text_to_csv_chunks.py <chunk_size>", file=sys.stderr)
        sys.exit(1)

    try:
        chunk_size = int(sys.argv[1])
    except ValueError:
        print("Error: Chunk size must be an integer.", file=sys.stderr)
        sys.exit(1)

    # With a size <= 0 the "batch is full" test below is true after every
    # line, silently producing one row per line; treat it as a usage error.
    if chunk_size <= 0:
        print("Error: Chunk size must be a positive integer.", file=sys.stderr)
        sys.exit(1)

    # Prepare CSV writer and write the header.
    writer = csv.writer(sys.stdout)
    writer.writerow(["Chunk", "Text"])

    chunk_number = 1
    line_buffer = []

    for line in sys.stdin:
        line_buffer.append(line.rstrip('\n'))

        # The buffer length is the line count, so no separate counter is kept.
        if len(line_buffer) >= chunk_size:
            writer.writerow([chunk_number, '\n'.join(line_buffer)])
            chunk_number += 1
            line_buffer = []

    # Write any remaining lines that did not fill a whole batch.
    if line_buffer:
        writer.writerow([chunk_number, '\n'.join(line_buffer)])


if __name__ == "__main__":
    main()
csvReader.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # csv_reader.py
2
+
3
+ import sys
4
+ import csv
5
+
6
def main():
    """Print every CSV record read from stdin as ``Row: [...]`` on stdout.

    Command-line arguments are not used; if any are given, a warning is
    printed to stderr and processing continues.
    """
    if len(sys.argv) > 1:
        print("Warning: This script ignores any command-line arguments.", file=sys.stderr)

    # The csv module asks for its input to be opened with newline='' so that
    # line endings inside quoted fields reach the parser untranslated. Real
    # stdin is a TextIOWrapper and can be switched in place; substitute
    # streams without reconfigure() (e.g. io.StringIO) are left alone.
    reconfigure = getattr(sys.stdin, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(newline="")

    for row in csv.reader(sys.stdin):
        print(f"Row: {row}")


if __name__ == "__main__":
    main()
dataSet.py ADDED
File without changes
loadDataset.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ import random
3
+
4
# Seed the module-level generator with a value drawn from the (unseeded)
# generator itself, in the range 0..256 inclusive. Because the seed is random
# and never reported, this does NOT make runs reproducible; print
# random_number (or hard-code a seed) if a run needs to be repeated.
random_number = random.randint(0, 256)
random.seed(random_number)

# Load the dataset and take its training split.
dataset = load_dataset("ajsbsd/14400")
train_dataset = dataset['train']

# Get total number of examples
total_examples = len(train_dataset)
print(f"Total examples in dataset: {total_examples}\n")

# Pick up to 5 unique random indices. random.sample raises ValueError when
# asked for more items than the population holds, so cap the sample size.
sample_size = min(5, total_examples)
random_indices = random.sample(range(total_examples), sample_size)

# Print the selected examples
for idx in random_indices:
    example = train_dataset[idx]
    print(f"--- Example (ID: {idx}) ---")
    print(f"Chunk ID: {example['id']}")
    print(f"Text:\n{example['text']}\n")
testToTraining.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # text_to_training_csv.py
2
+
3
+ import sys
4
+ import csv
5
+
6
def main():
    """Split stdin into fixed-size character chunks and write them as CSV.

    The chunk size (in characters) is taken from ``sys.argv[1]``. An
    ``id,text`` header is written to stdout, followed by one row per chunk
    with ids counting up from 1. The last row may be shorter than the chunk
    size when the input length is not an exact multiple of it.

    Exits with status 1, after printing a message to stderr, when the
    argument is missing, is not an integer, or is not a positive integer.
    """
    if len(sys.argv) < 2:
        print("Usage: python text_to_training_csv.py <chunk_size>", file=sys.stderr)
        sys.exit(1)

    try:
        chunk_size = int(sys.argv[1])
    except ValueError:
        print("Error: Chunk size must be an integer.", file=sys.stderr)
        sys.exit(1)

    # read(0) returns '' immediately (only the header would be written and the
    # input silently dropped), and a negative size makes read() consume the
    # whole input and the slices below cut it relative to the end. Reject both.
    if chunk_size <= 0:
        print("Error: Chunk size must be a positive integer.", file=sys.stderr)
        sys.exit(1)

    writer = csv.writer(sys.stdout)
    writer.writerow(["id", "text"])

    id_counter = 1
    buffer = ''

    while True:
        chunk = sys.stdin.read(chunk_size)
        if not chunk:
            break
        buffer += chunk

        # Emit a row as soon as a full chunk is buffered; keep the remainder.
        if len(buffer) >= chunk_size:
            writer.writerow([id_counter, buffer[:chunk_size]])
            id_counter += 1
            buffer = buffer[chunk_size:]

    # Write remaining buffer
    if buffer:
        writer.writerow([id_counter, buffer])


if __name__ == "__main__":
    main()
trainingText.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!//home/aaron/gradio_test/bin/python
2
+ ### ✅ Example: Pull Random Records Based on Dataset Size
3
+ #
4
+ #Here’s a complete Python example using Hugging Face's `datasets` library:
5
+ #
6
+ from datasets import load_dataset
7
+ import random
8
+
9
# Fixed seed so the shuffled order is the same on every run.
random.seed(42)

# Fetch the dataset from Hugging Face and take its training split.
dataset = load_dataset("ajsbsd/14400")
train_dataset = dataset["train"]

# Report how many records the split holds.
total_records = len(train_dataset)
print(f"Total records in dataset: {total_records}\n")

# The sample size is fixed to the whole split, so every record is printed
# once, in random order.
num_samples = total_records

if 0 < num_samples <= total_records:
    # Draw the requested number of distinct indices.
    random_indices = random.sample(range(total_records), num_samples)

    # Print each selected record with a 1-based running number.
    for i, idx in enumerate(random_indices, 1):
        record = train_dataset[idx]
        print(f"--- Record #{i} (Index: {idx}) ---")
        print(f"ID: {record['id']}")
        print(f"Text:\n{record['text']}\n")
else:
    # Reached only when the sample size is outside 1..total_records.
    print(f"Please enter a number between 1 and {total_records}.")
37
+
38
+ ### 🧠 What This Does
39
+ #
40
+ # Loads the dataset
41
+ # Gets the total number of records automatically
42
+ # Uses the full record count as the sample size (the interactive prompt is commented out)
43
+ # Picks that many random rows and prints them
44
+ #
45
+ ### 🔁 Example Run
46
+ #
47
+ #Total records in dataset: 256
48
+ #
49
+ #How many random records would you like to see? 5
50
+ #
51
+ #--- Record #1 (Index: 203) ---
52
+ #ID: 204
53
+ #Text:
54
+ #It was the...
55
+ #
56
+ #--- Record #2 (Index: 15) ---
57
+ #ID: 16
58
+ #Text:
59
+ #The period...
60
+ #
61
+ #
62
+ ### 📌 Want to Do This Without User Input?
63
+ #
64
+ #You can hardcode the number of samples:
65
+ #
66
+ #num_samples = 5
67
+ #
68
+ #Or make it part of a function:
69
+ #
70
+ #
71
+ #def get_random_samples(dataset, num_samples):
72
+ # total = len(dataset)
73
+ # indices = random.sample(range(total), num_samples)
74
+ # return [dataset[i] for i in indices]
75
+