Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	
		=
		
	committed on
		
		
					Commit 
							
							·
						
						fd342b4
	
1
								Parent(s):
							
							0d00a0e
								
Add compression and decompression functions for fact check data; update dependencies and remove obsolete files
Browse files
 - .gitattributes +1 -0
 - Manifest.toml +1 -1
 - Project.toml +1 -0
 - data/fc_latest_maxi_compr +3 -0
 - scripts/UpdateHuggingFaceAPI.jl +4 -0
 - server.jl +6 -1
 - src/OstreaCultura.jl +3 -0
 - src/compress.jl +39 -0
 
    	
        .gitattributes
    CHANGED
    
    | 
         @@ -37,3 +37,4 @@ data/filtered_fact_check_latest_embed.csv filter=lfs diff=lfs merge=lfs -text 
     | 
|
| 37 | 
         
             
            data/random_300k.csv filter=lfs diff=lfs merge=lfs -text
         
     | 
| 38 | 
         
             
            *.csv filter=lfs diff=lfs merge=lfs -text
         
     | 
| 39 | 
         
             
            *.json filter=lfs diff=lfs merge=lfs -text
         
     | 
| 
         | 
| 
         | 
|
| 37 | 
         
             
            data/random_300k.csv filter=lfs diff=lfs merge=lfs -text
         
     | 
| 38 | 
         
             
            *.csv filter=lfs diff=lfs merge=lfs -text
         
     | 
| 39 | 
         
             
            *.json filter=lfs diff=lfs merge=lfs -text
         
     | 
| 40 | 
         
            +
            data/fc_latest_maxi_compr filter=lfs diff=lfs merge=lfs -text
         
     | 
    	
        Manifest.toml
    CHANGED
    
    | 
         @@ -2,7 +2,7 @@ 
     | 
|
| 2 | 
         | 
| 3 | 
         
             
            julia_version = "1.10.4"
         
     | 
| 4 | 
         
             
            manifest_format = "2.0"
         
     | 
| 5 | 
         
            -
            project_hash = " 
     | 
| 6 | 
         | 
| 7 | 
         
             
            [[deps.AbstractTrees]]
         
     | 
| 8 | 
         
             
            git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177"
         
     | 
| 
         | 
|
| 2 | 
         | 
| 3 | 
         
             
            julia_version = "1.10.4"
         
     | 
| 4 | 
         
             
            manifest_format = "2.0"
         
     | 
| 5 | 
         
            +
            project_hash = "071291b10413261c56b71962d94f340814c6f62c"
         
     | 
| 6 | 
         | 
| 7 | 
         
             
            [[deps.AbstractTrees]]
         
     | 
| 8 | 
         
             
            git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177"
         
     | 
    	
        Project.toml
    CHANGED
    
    | 
         @@ -5,6 +5,7 @@ version = "1.0.0-DEV" 
     | 
|
| 5 | 
         | 
| 6 | 
         
             
            [deps]
         
     | 
| 7 | 
         
             
            CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
         
     | 
| 
         | 
|
| 8 | 
         
             
            DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
         
     | 
| 9 | 
         
             
            Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
         
     | 
| 10 | 
         
             
            Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
         
     | 
| 
         | 
|
| 5 | 
         | 
| 6 | 
         
             
            [deps]
         
     | 
| 7 | 
         
             
            CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
         
     | 
| 8 | 
         
            +
            CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
         
     | 
| 9 | 
         
             
            DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
         
     | 
| 10 | 
         
             
            Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
         
     | 
| 11 | 
         
             
            Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
         
     | 
    	
        data/fc_latest_maxi_compr
    ADDED
    
    | 
         @@ -0,0 +1,3 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            version https://git-lfs.github.com/spec/v1
         
     | 
| 2 | 
         
            +
            oid sha256:bfec2c2ec6ed5b0d4df6bc91838a72cd4a87db7d66c6f89245d6534557973e27
         
     | 
| 3 | 
         
            +
            size 341251717
         
     | 
    	
        scripts/UpdateHuggingFaceAPI.jl
    CHANGED
    
    | 
         @@ -33,3 +33,7 @@ narrs = narrs[.!ismissing.(narrs.text), :] 
     | 
|
| 33 | 
         
             
            narratives_embed = OC.maxi_embed.(narrs.text) # seconds to run 
         
     | 
| 34 | 
         
             
            narrs.Embeddings = narratives_embed
         
     | 
| 35 | 
         
             
            CSV.write("data/expansive_claims_library_expanded_embed_maxi.csv", narrs)
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 33 | 
         
             
            narratives_embed = OC.maxi_embed.(narrs.text) # seconds to run 
         
     | 
| 34 | 
         
             
            narrs.Embeddings = narratives_embed
         
     | 
| 35 | 
         
             
            CSV.write("data/expansive_claims_library_expanded_embed_maxi.csv", narrs)
         
     | 
| 36 | 
         
            +
            # Compress the fact check data
         
     | 
| 37 | 
         
            +
            OC.compress_csv("data/fact_check_latest_embed_maxi.csv", "data/fc_latest_maxi_compr")
         
     | 
| 38 | 
         
            +
            # Delete the original 
         
     | 
| 39 | 
         
            +
            rm("data/fact_check_latest_embed_maxi.csv")
         
     | 
    	
        server.jl
    CHANGED
    
    | 
         @@ -3,7 +3,12 @@ using HTTP 
     | 
|
| 3 | 
         
             
            import OstreaCultura as OC
         
     | 
| 4 | 
         | 
| 5 | 
         
             
            # Load the fasttext embeddings and the fasttext model
         
     | 
| 6 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 7 | 
         
             
            const (nar_embed, nar) = OC.load_fasttext_embeddings("data/expansive_claims_library_expanded_embed_maxi.csv")
         
     | 
| 8 | 
         | 
| 9 | 
         
             
            @get "/greet" function(req::HTTP.Request)
         
     | 
| 
         | 
|
| 3 | 
         
             
            import OstreaCultura as OC
         
     | 
| 4 | 
         | 
| 5 | 
         
             
            # Load the fasttext embeddings and the fasttext model
         
     | 
| 6 | 
         
            +
            tmp_destination = tempname()
         
     | 
| 7 | 
         
            +
            # Decompress the fact check data 
         
     | 
| 8 | 
         
            +
            OC.decompress_csv("data/fc_latest_maxi_compr", tmp_destination)
         
     | 
| 9 | 
         
            +
             
     | 
| 10 | 
         
            +
            #####
         
     | 
| 11 | 
         
            +
            const (fc_embed, fc) = OC.load_fasttext_embeddings(tmp_destination)
         
     | 
| 12 | 
         
             
            const (nar_embed, nar) = OC.load_fasttext_embeddings("data/expansive_claims_library_expanded_embed_maxi.csv")
         
     | 
| 13 | 
         | 
| 14 | 
         
             
            @get "/greet" function(req::HTTP.Request)
         
     | 
    	
        src/OstreaCultura.jl
    CHANGED
    
    | 
         @@ -5,6 +5,8 @@ module OstreaCultura 
     | 
|
| 5 | 
         | 
| 6 | 
         
             
            using JSON3, Dates, Sqids, CSV, DataFrames, StatsBase, Distances, PyCall
         
     | 
| 7 | 
         | 
| 
         | 
|
| 
         | 
|
| 8 | 
         
             
            import Pandas.DataFrame as pdataframe
         
     | 
| 9 | 
         | 
| 10 | 
         
             
            export MiniEncoder
         
     | 
| 
         @@ -20,6 +22,7 @@ export MiniEncoder 
     | 
|
| 20 | 
         
             
            include("py_init.jl")
         
     | 
| 21 | 
         
             
            include("Embeddings.jl")
         
     | 
| 22 | 
         
             
            include("PyPineCone.jl")
         
     | 
| 
         | 
|
| 23 | 
         
             
            #include("Models.jl")
         
     | 
| 24 | 
         | 
| 25 | 
         
             
            end
         
     | 
| 
         | 
|
| 5 | 
         | 
| 6 | 
         
             
            using JSON3, Dates, Sqids, CSV, DataFrames, StatsBase, Distances, PyCall
         
     | 
| 7 | 
         | 
| 8 | 
         
            +
            using CodecZlib
         
     | 
| 9 | 
         
            +
             
     | 
| 10 | 
         
             
            import Pandas.DataFrame as pdataframe
         
     | 
| 11 | 
         | 
| 12 | 
         
             
            export MiniEncoder
         
     | 
| 
         | 
|
| 22 | 
         
             
            include("py_init.jl")
         
     | 
| 23 | 
         
             
            include("Embeddings.jl")
         
     | 
| 24 | 
         
             
            include("PyPineCone.jl")
         
     | 
| 25 | 
         
            +
            include("compress.jl")
         
     | 
| 26 | 
         
             
            #include("Models.jl")
         
     | 
| 27 | 
         | 
| 28 | 
         
             
            end
         
     | 
    	
        src/compress.jl
    ADDED
    
    | 
         @@ -0,0 +1,39 @@ 
     | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
| 
         | 
|
| 1 | 
         
            +
            #using CodecZlib
         
     | 
| 2 | 
         
            +
            #using CSV
         
     | 
| 3 | 
         
            +
            #using DataFrames
         
     | 
| 4 | 
         
            +
             
     | 
| 5 | 
         
            +
            function compress_csv(input_path::String, output_path::String=input_path * ".gz")
         
     | 
| 6 | 
         
            +
                println("Compressing $input_path to $output_path...")
         
     | 
| 7 | 
         
            +
                
         
     | 
| 8 | 
         
            +
                open(input_path, "r") do input_io
         
     | 
| 9 | 
         
            +
                    open(output_path, "w") do output_io
         
     | 
| 10 | 
         
            +
                        stream = GzipCompressorStream(output_io)
         
     | 
| 11 | 
         
            +
                        write(stream, read(input_io))
         
     | 
| 12 | 
         
            +
                        close(stream)
         
     | 
| 13 | 
         
            +
                    end
         
     | 
| 14 | 
         
            +
                end
         
     | 
| 15 | 
         
            +
                
         
     | 
| 16 | 
         
            +
                # Calculate compression ratio
         
     | 
| 17 | 
         
            +
                original_size = filesize(input_path)
         
     | 
| 18 | 
         
            +
                compressed_size = filesize(output_path)
         
     | 
| 19 | 
         
            +
                ratio = (1 - compressed_size / original_size) * 100
         
     | 
| 20 | 
         
            +
                
         
     | 
| 21 | 
         
            +
                println("Compression complete: $(round(original_size / 1024^2, digits=2)) MB → $(round(compressed_size / 1024^2, digits=2)) MB ($(round(ratio, digits=1))% reduction)")
         
     | 
| 22 | 
         
            +
                return output_path
         
     | 
| 23 | 
         
            +
            end
         
     | 
| 24 | 
         
            +
             
     | 
| 25 | 
         
            +
            function decompress_csv(input_path::String, output_path::String)
         
     | 
| 26 | 
         
            +
                println("Decompressing $input_path to $output_path...")
         
     | 
| 27 | 
         
            +
                
         
     | 
| 28 | 
         
            +
                open(input_path, "r") do input_io
         
     | 
| 29 | 
         
            +
                    open(output_path, "w") do output_io
         
     | 
| 30 | 
         
            +
                        stream = GzipDecompressorStream(input_io)
         
     | 
| 31 | 
         
            +
                        write(output_io, read(stream))
         
     | 
| 32 | 
         
            +
                        close(stream)
         
     | 
| 33 | 
         
            +
                    end
         
     | 
| 34 | 
         
            +
                end
         
     | 
| 35 | 
         
            +
                
         
     | 
| 36 | 
         
            +
                println("Decompression complete!")
         
     | 
| 37 | 
         
            +
                return output_path
         
     | 
| 38 | 
         
            +
            end
         
     | 
| 39 | 
         
            +
             
     |