| |
|
| |
|
| | #include <stdio.h> |
| | #include <fcntl.h> |
| | #include <stdlib.h> |
| | #include <unistd.h> |
| | #include <string.h> |
| |
|
| |
|
| |
|
| |
|
| |
|
| | #include "tokenizer.h" |
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
/*
 * Return the size in bytes of the file at `filename`.
 *
 * The file is opened in binary mode and the size is taken from the stream
 * position after seeking to the end. The function only returns on success:
 * if the file cannot be opened, the seek or tell fails, or the size does not
 * fit in a uint32_t, a diagnostic is printed to stderr and the process exits
 * with EXIT_FAILURE.
 */
uint32_t get_file_size(const char *filename) {
    FILE *file = fopen(filename, "rb");
    if (!file) {
        perror("fopen");
        exit(EXIT_FAILURE);
    }

    if (fseek(file, 0, SEEK_END) != 0) {
        perror("fseek");
        fclose(file);
        exit(EXIT_FAILURE);
    }

    /* Keep ftell's result in its native signed type: narrowing it first
     * would make the -1 error value indistinguishable from a real size. */
    long pos = ftell(file);
    if (pos < 0) {
        perror("ftell");
        fclose(file);
        exit(EXIT_FAILURE);
    }

    fclose(file);

    /* Refuse sizes the return type cannot represent instead of silently
     * truncating them. */
    if ((unsigned long)pos > UINT32_MAX) {
        fprintf(stderr, "%s: file too large (%ld bytes)\n", filename, pos);
        exit(EXIT_FAILURE);
    }

    return (uint32_t)pos;
}
| |
|
| |
|
| | int main(int argc, char *argv[]) { |
| | if(argc < 3) { |
| | printf("Needs at least 2 arguments\n"); |
| | return 1; |
| | } |
| |
|
| | char *fname[2] = {argv[1], argv[2]}; |
| | int fd[2] = { |
| | open(fname[0], O_RDONLY), |
| | open(fname[1], O_CREAT | O_WRONLY, 0644) |
| | }; |
| | |
| | printf("Vocab size is %d\n", tokenizer.get_vocab_size()); |
| | printf("File names: %s, %s\n", fname[0], fname[1]); |
| | printf("Got file descriptor: %d, %d\n", fd[0], fd[1]); |
| |
|
| | uint32_t size = get_file_size(fname[0]); |
| | uint8_t *data = (uint8_t *) malloc(size * sizeof(uint8_t)); |
| | uint32_t n = read(fd[0], data, size * sizeof(uint8_t)); |
| |
|
| | printf("Expected file size of %d bytes\n", size); |
| | printf("Read %d bytes\n", n); |
| | if (size != n) perror(0); |
| |
|
| | uint8_t *p = data; |
| | uint32_t d = 0; |
| | for (int i = 0; *p; ++i) { |
| | uint16_t tok = tokenizer.encode(&tokenizer, &p); |
| | write(fd[1], &tok, sizeof(uint16_t)); |
| | |
| | if (i % 10000 == 0) { |
| | printf("Current token %d | Index position %d | Remaining %d | Tokenization delta %ld\n", tok, i, d, d - ((data + n) - p)); |
| | d = (data + n) - p; |
| | } |
| |
|
| | } |
| |
|
| | printf("\n"); |
| | return 0; |
| | } |
| |
|