Spaces:

stefanjwojcik
/

misinfo_detection_app

Sleeping

= committed on Jun 3

Commit

fd342b4

1 Parent(s): 0d00a0e

Add compression and decompression functions for fact check data; update dependencies and remove obsolete files

Files changed (8) hide show

.gitattributes CHANGED Viewed

@@ -37,3 +37,4 @@ data/filtered_fact_check_latest_embed.csv filter=lfs diff=lfs merge=lfs -text
 data/random_300k.csv filter=lfs diff=lfs merge=lfs -text
 *.csv filter=lfs diff=lfs merge=lfs -text
 *.json filter=lfs diff=lfs merge=lfs -text

 data/random_300k.csv filter=lfs diff=lfs merge=lfs -text
 *.csv filter=lfs diff=lfs merge=lfs -text
 *.json filter=lfs diff=lfs merge=lfs -text
+data/fc_latest_maxi_compr filter=lfs diff=lfs merge=lfs -text

Manifest.toml CHANGED Viewed

@@ -2,7 +2,7 @@
 julia_version = "1.10.4"
 manifest_format = "2.0"
-project_hash = "1ce95d4f8f4617f58a3df72191590f2e35b92b89"
 [[deps.AbstractTrees]]
 git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177"

 julia_version = "1.10.4"
 manifest_format = "2.0"
+project_hash = "071291b10413261c56b71962d94f340814c6f62c"
 [[deps.AbstractTrees]]
 git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177"

Project.toml CHANGED Viewed

@@ -5,6 +5,7 @@ version = "1.0.0-DEV"
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"

 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"

data/fc_latest_maxi_compr ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:bfec2c2ec6ed5b0d4df6bc91838a72cd4a87db7d66c6f89245d6534557973e27
+size 341251717

scripts/UpdateHuggingFaceAPI.jl CHANGED Viewed

@@ -33,3 +33,7 @@ narrs = narrs[.!ismissing.(narrs.text), :]
 narratives_embed = OC.maxi_embed.(narrs.text) # seconds to run
 narrs.Embeddings = narratives_embed
 CSV.write("data/expansive_claims_library_expanded_embed_maxi.csv", narrs)

 narratives_embed = OC.maxi_embed.(narrs.text) # seconds to run
 narrs.Embeddings = narratives_embed
 CSV.write("data/expansive_claims_library_expanded_embed_maxi.csv", narrs)
+# Compress the fact check data
+OC.compress_csv("data/fact_check_latest_embed_maxi.csv", "data/fc_latest_maxi_compr")
+# Delete the original
+rm("data/fact_check_latest_embed_maxi.csv")

server.jl CHANGED Viewed

@@ -3,7 +3,12 @@ using HTTP
 import OstreaCultura as OC
 # Load the fasttext embeddings and the fasttext model
-const (fc_embed, fc) = OC.load_fasttext_embeddings("data/fact_check_latest_embed_maxi.csv")
 const (nar_embed, nar) = OC.load_fasttext_embeddings("data/expansive_claims_library_expanded_embed_maxi.csv")
 @get "/greet" function(req::HTTP.Request)

 import OstreaCultura as OC
 # Load the fasttext embeddings and the fasttext model
+tmp_destination = tempname()
+# Decompress the fact check data
+OC.decompress_csv("data/fc_latest_maxi_compr", tmp_destination)
+#####
+const (fc_embed, fc) = OC.load_fasttext_embeddings(tmp_destination)
 const (nar_embed, nar) = OC.load_fasttext_embeddings("data/expansive_claims_library_expanded_embed_maxi.csv")
 @get "/greet" function(req::HTTP.Request)

src/OstreaCultura.jl CHANGED Viewed

@@ -5,6 +5,8 @@ module OstreaCultura
 using JSON3, Dates, Sqids, CSV, DataFrames, StatsBase, Distances, PyCall
 import Pandas.DataFrame as pdataframe
 export MiniEncoder
@@ -20,6 +22,7 @@ export MiniEncoder
 include("py_init.jl")
 include("Embeddings.jl")
 include("PyPineCone.jl")
 #include("Models.jl")
 end

 using JSON3, Dates, Sqids, CSV, DataFrames, StatsBase, Distances, PyCall
+using CodecZlib
 import Pandas.DataFrame as pdataframe
 export MiniEncoder
 include("py_init.jl")
 include("Embeddings.jl")
 include("PyPineCone.jl")
+include("compress.jl")
 #include("Models.jl")
 end

src/compress.jl ADDED Viewed

+#using CodecZlib
+#using CSV
+#using DataFrames
+"""
+    compress_csv(input_path::String, output_path::String=input_path * ".gz") -> String
+
+Gzip-compress the file at `input_path` into `output_path` and return `output_path`.
+
+The input is copied through the compressor as a stream instead of being read into
+memory in one piece. A progress line and a size summary are printed to standard
+output.
+
+# Arguments
+- `input_path`: file to compress.
+- `output_path`: destination file; defaults to `input_path` with `.gz` appended.
+  It is opened in `"w"` mode, so an existing file at that path is overwritten.
+
+# Throws
+- `ArgumentError` if both paths refer to the same existing file, because opening
+  the output for writing would truncate the input before it is read.
+- Whatever `open` throws when either path cannot be opened.
+"""
+function compress_csv(input_path::String, output_path::String=input_path * ".gz")
+    # `samefile` is false when either path does not exist, so a missing input still
+    # surfaces as the usual error from `open` below.
+    if samefile(input_path, output_path)
+        throw(ArgumentError("output_path must differ from input_path: $input_path"))
+    end
+    println("Compressing $input_path to $output_path...")
+    open(input_path, "r") do input_io
+        open(output_path, "w") do output_io
+            stream = GzipCompressorStream(output_io)
+            try
+                # Stream the bytes across rather than materialising the whole file.
+                write(stream, input_io)
+            finally
+                # Closing flushes the remaining compressed data; do it even on error.
+                close(stream)
+            end
+        end
+    end
+    # Calculate compression ratio
+    original_size = filesize(input_path)
+    compressed_size = filesize(output_path)
+    # An empty input would otherwise divide by zero and report a non-finite ratio.
+    ratio = original_size == 0 ? 0.0 : (1 - compressed_size / original_size) * 100
+    println("Compression complete: $(round(original_size / 1024^2, digits=2)) MB → $(round(compressed_size / 1024^2, digits=2)) MB ($(round(ratio, digits=1))% reduction)")
+    return output_path
+end
+"""
+    decompress_csv(input_path::String, output_path::String) -> String
+
+Decompress the gzip file at `input_path` into `output_path` and return `output_path`.
+
+The decompressed bytes are copied to the output as a stream instead of being held
+in memory in one piece. Progress messages are printed to standard output.
+
+# Arguments
+- `input_path`: gzip-compressed file to read.
+- `output_path`: destination file; it is opened in `"w"` mode, so an existing file
+  at that path is overwritten.
+
+# Throws
+- `ArgumentError` if both paths refer to the same existing file, because opening
+  the output for writing would truncate the input before it is read.
+- Whatever `open` or the decompressor throws on unreadable or invalid input.
+"""
+function decompress_csv(input_path::String, output_path::String)
+    # `samefile` is false when either path does not exist, so a missing input still
+    # surfaces as the usual error from `open` below.
+    if samefile(input_path, output_path)
+        throw(ArgumentError("output_path must differ from input_path: $input_path"))
+    end
+    println("Decompressing $input_path to $output_path...")
+    open(input_path, "r") do input_io
+        open(output_path, "w") do output_io
+            stream = GzipDecompressorStream(input_io)
+            try
+                # Stream the bytes across rather than materialising the whole payload.
+                write(output_io, stream)
+            finally
+                # Release the decompressor even if the copy fails part-way.
+                close(stream)
+            end
+        end
+    end
+    println("Decompression complete!")
+    return output_path
+end