diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..c739cf3dd687fbdde69b898a1befe9245a6738cb 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +*.db filter=lfs diff=lfs merge=lfs -text +*.faiss filter=lfs diff=lfs merge=lfs -text +*.txt filter=lfs diff=lfs merge=lfs -text diff --git a/ETL/embeddings_base.ipynb b/ETL/embeddings_base.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..de2406e97507bf6ee22b0d2e8af0eb195e2fe960 --- /dev/null +++ b/ETL/embeddings_base.ipynb @@ -0,0 +1,125 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from haystack.nodes import PreProcessor, EmbeddingRetriever\n", + "from haystack.document_stores import FAISSDocumentStore\n", + "from haystack.utils import convert_files_to_docs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocess Documents" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### BLAB-Wiki" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preprocessor = PreProcessor(\n", + " clean_empty_lines=True,\n", + " clean_whitespace=True,\n", + " clean_header_footer=False,\n", + " split_by=\"sentence\",\n", + " split_length=2,\n", + " split_overlap=1,\n", + " split_respect_sentence_boundary=False)\n", + "\n", + "all_docs = convert_files_to_docs(dir_path=\"./Fontes/Wiki_Pages/\")\n", + "docs_default = preprocessor.process(all_docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### QA Source" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# QA sentences\n", + "QA_path = \"./Fontes/QA_Base/\"\n", + "\n", + "train = pd.read_parquet(QA_path + 'train.parquet')['new_long_answers']\n", + "test = pd.read_parquet(QA_path + 'test.parquet')['new_long_answers']\n", + "validation = pd.read_parquet(QA_path + 'validation.parquet')['new_long_answers']\n", + "\n", + "answers = pd.concat([train,test,validation])\n", + "\n", + "docs_list = [{\"content\": v, \"content_type\": \"text\", \"score\":None, \"meta\":None} for i,v in answers.items()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create DocumentsStore and calculate Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "document_store = FAISSDocumentStore(similarity=\"dot_product\", embedding_dim=512)\n", + "document_store.write_documents(docs_default + docs_list)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "retriever = EmbeddingRetriever(\n", + " document_store=document_store, \n", + " embedding_model=\"sentence-transformers/distiluse-base-multilingual-cased-v1\")\n", + "\n", + "document_store.update_embeddings(retriever, batch_size=10000)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ETL/faiss_document_store.db b/ETL/faiss_document_store.db new file mode 100644 index 0000000000000000000000000000000000000000..d74d0eff8fc9fcf4de05b3e4ff3a982fd8229f8f --- /dev/null +++ b/ETL/faiss_document_store.db @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9485f1032e10bbd8003a214808ff3e3c8f27ff0f04918ec0f03ac68bbdfdb49b +size 112242688 diff --git a/Fontes/QA_Base/final_QA.txt 
b/Fontes/QA_Base/final_QA.txt new file mode 100644 index 0000000000000000000000000000000000000000..055ef7a601795484d7dd638cdaa238f5fa70b0e8 --- /dev/null +++ b/Fontes/QA_Base/final_QA.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a4f5ea0cb0c4dfcc7b070eba9309e4e2e1b1c4e55374c2259beb82d9191d0c0 +size 65438790 diff --git a/Fontes/QA_Base/test.parquet b/Fontes/QA_Base/test.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9fe317296351bce8c1454fdddc6f4c9b46e4cf2f --- /dev/null +++ b/Fontes/QA_Base/test.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23c63f0b6d06385135eae98910bf2a5ef2fd91a4fdbf8d0bbde327b0333ce564 +size 5548810 diff --git a/Fontes/QA_Base/train.parquet b/Fontes/QA_Base/train.parquet new file mode 100644 index 0000000000000000000000000000000000000000..08efde4e7a6e0e3e19a137d2827426d2d76e8dde --- /dev/null +++ b/Fontes/QA_Base/train.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b53d5cea3123cd5b1ec22e4fa9d1b2979f371b84e38e98d0adf3092cf287fc +size 44653858 diff --git a/Fontes/QA_Base/validation.parquet b/Fontes/QA_Base/validation.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0ba391213c4ba42f113984aa66b91813eb01a704 --- /dev/null +++ b/Fontes/QA_Base/validation.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4274efda6a56fcff245fc0ef7470ddf3fd02d36147fba439359ef44354e6ee7a +size 5585515 diff --git "a/Fontes/Wiki_Pages/Amaz\303\264nia Azul Tecnologias de Defesa.txt" "b/Fontes/Wiki_Pages/Amaz\303\264nia Azul Tecnologias de Defesa.txt" new file mode 100644 index 0000000000000000000000000000000000000000..1385fd46e5619a7061a288425ec9aad880f2cd8e --- /dev/null +++ "b/Fontes/Wiki_Pages/Amaz\303\264nia Azul Tecnologias de Defesa.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfd6ca5bee0dcfee953031a110d0bf1f1b7f73375c04fe9392b1bf7220a98379 +size 1721 
diff --git "a/Fontes/Wiki_Pages/Ambiente pel\303\241gico.txt" "b/Fontes/Wiki_Pages/Ambiente pel\303\241gico.txt" new file mode 100644 index 0000000000000000000000000000000000000000..2c8dda4cc30b5e57823962aab6e883e15ddf7337 --- /dev/null +++ "b/Fontes/Wiki_Pages/Ambiente pel\303\241gico.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:038a305900af0b13133f295ec30241174e4f51b0dec658596aad83c9ab5e87c6 +size 2696 diff --git "a/Fontes/Wiki_Pages/Anel\303\255deo.txt" "b/Fontes/Wiki_Pages/Anel\303\255deo.txt" new file mode 100644 index 0000000000000000000000000000000000000000..36b25c3ad765658bc9ad75f5a7abc538b9454514 --- /dev/null +++ "b/Fontes/Wiki_Pages/Anel\303\255deo.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5d26ed7243d4f57f0434101cffb8e18c8ffeb43688b7eb703a4aafec2d02a59 +size 122421 diff --git "a/Fontes/Wiki_Pages/Atividade Petrol\303\255fera.txt" "b/Fontes/Wiki_Pages/Atividade Petrol\303\255fera.txt" new file mode 100644 index 0000000000000000000000000000000000000000..5cccc7c20d9ad64adac34873cc791210c058687a --- /dev/null +++ "b/Fontes/Wiki_Pages/Atividade Petrol\303\255fera.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3263389d651763e8d163f8ce6c8784c5ec5936981915cd9ea6f1219ac4b5b739 +size 22735 diff --git a/Fontes/Wiki_Pages/Biologia marinha.txt b/Fontes/Wiki_Pages/Biologia marinha.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa6a1a282209ed24147b2d966eeed01df5baf0b5 --- /dev/null +++ b/Fontes/Wiki_Pages/Biologia marinha.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c07e46ad2a0106c90395f6e86a4e966b8a73d8059b2a1a7a29b1025824857dd1 +size 19333 diff --git a/Fontes/Wiki_Pages/Cnidaria.txt b/Fontes/Wiki_Pages/Cnidaria.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e3f2f3db15faf49bb203d21b3211c1c16a4e009 --- /dev/null +++ b/Fontes/Wiki_Pages/Cnidaria.txt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:cf906f3cde777e8921b959bbdce6b66c01266ba9a128620916ff824d0db8db5e +size 19116 diff --git "a/Fontes/Wiki_Pages/Coloniza\303\247\303\243o do Brasil.txt" "b/Fontes/Wiki_Pages/Coloniza\303\247\303\243o do Brasil.txt" new file mode 100644 index 0000000000000000000000000000000000000000..145c4702c2470c8bc236c912b6d524d89c1b060b --- /dev/null +++ "b/Fontes/Wiki_Pages/Coloniza\303\247\303\243o do Brasil.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7b5acfa75493812c137a4f9f2200731fe66b20525d16a198201b135c876ea9a +size 9822 diff --git "a/Fontes/Wiki_Pages/Conserva\303\247\303\243o das esp\303\251cies conhecidas.txt" "b/Fontes/Wiki_Pages/Conserva\303\247\303\243o das esp\303\251cies conhecidas.txt" new file mode 100644 index 0000000000000000000000000000000000000000..9d145a96cc0161b2e1099c00846a5c6d3e77cd4f --- /dev/null +++ "b/Fontes/Wiki_Pages/Conserva\303\247\303\243o das esp\303\251cies conhecidas.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fa628a8530ce3d39d10daa60be196796a5cb8bf2021eb6d28c6ea7444e4d15e +size 13787 diff --git "a/Fontes/Wiki_Pages/Constitui\303\247\303\243o Federal.txt" "b/Fontes/Wiki_Pages/Constitui\303\247\303\243o Federal.txt" new file mode 100644 index 0000000000000000000000000000000000000000..b0ebfc30aaf861c20782f2b9d7f25671ed41160e --- /dev/null +++ "b/Fontes/Wiki_Pages/Constitui\303\247\303\243o Federal.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9729ca295ec91f00006a6c0f6f54b1acad47def4c0d70fdbcf5961925472d64 +size 4729 diff --git "a/Fontes/Wiki_Pages/Defini\303\247\303\243o dos Espa\303\247os Marinhos.txt" "b/Fontes/Wiki_Pages/Defini\303\247\303\243o dos Espa\303\247os Marinhos.txt" new file mode 100644 index 0000000000000000000000000000000000000000..9517e42c3ddfc69e371fb6df16725d8372618b8a --- /dev/null +++ "b/Fontes/Wiki_Pages/Defini\303\247\303\243o dos Espa\303\247os Marinhos.txt" @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:75dc4306c08ad27ee9c5ab4dcba52c203dd6591fe0b8a710fc69aa86e781cb5f +size 12481 diff --git a/Fontes/Wiki_Pages/Desastres ambientais no ambiente costeiro e marinho.txt b/Fontes/Wiki_Pages/Desastres ambientais no ambiente costeiro e marinho.txt new file mode 100644 index 0000000000000000000000000000000000000000..c444063f61113a0023d72a3ce2c8032a422dfb1c --- /dev/null +++ b/Fontes/Wiki_Pages/Desastres ambientais no ambiente costeiro e marinho.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:975fa4b9a423e8695d81953b6f78e572e7a1c311dcdec7614c84cf98f059c0ab +size 16068 diff --git a/Fontes/Wiki_Pages/Ecossistema marinho.txt b/Fontes/Wiki_Pages/Ecossistema marinho.txt new file mode 100644 index 0000000000000000000000000000000000000000..625f4389dd3a63b36cff501537b1a9fec9521515 --- /dev/null +++ b/Fontes/Wiki_Pages/Ecossistema marinho.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eddfdc7fbaf591cd0e9906c5b0bb4eb2c8ccbf47d9be9792a86fdadde652732f +size 7909 diff --git a/Fontes/Wiki_Pages/Ecossistemas costeiros.txt b/Fontes/Wiki_Pages/Ecossistemas costeiros.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd3ee073654e252d9c9ea5730b05b8f44bbd1c44 --- /dev/null +++ b/Fontes/Wiki_Pages/Ecossistemas costeiros.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b96744e2990d8febe29f98b556c1198e695cc6b58e308ed785cd5c608b5b235 +size 42281 diff --git "a/Fontes/Wiki_Pages/Eros\303\243o e Sedimenta\303\247\303\243o Costeiras.txt" "b/Fontes/Wiki_Pages/Eros\303\243o e Sedimenta\303\247\303\243o Costeiras.txt" new file mode 100644 index 0000000000000000000000000000000000000000..e31fe67bd8bc1358716de25cbbcfe3fad4285ede --- /dev/null +++ "b/Fontes/Wiki_Pages/Eros\303\243o e Sedimenta\303\247\303\243o Costeiras.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0bc1994361bef6f05007c53e684260268a81c197014f2dfbe84847a4ede1e5f5 +size 3480 diff --git "a/Fontes/Wiki_Pages/Esportes Mar\303\255timos.txt" "b/Fontes/Wiki_Pages/Esportes Mar\303\255timos.txt" new file mode 100644 index 0000000000000000000000000000000000000000..19bd7a7934d6af1d8b003318f726ac13f5920502 --- /dev/null +++ "b/Fontes/Wiki_Pages/Esportes Mar\303\255timos.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fa3f4cb4a71bec1e01aea3f86379487b308d699d504f5f5c7e287285db367fa +size 4078 diff --git "a/Fontes/Wiki_Pages/Evolu\303\247\303\243o territorial do Brasil.txt" "b/Fontes/Wiki_Pages/Evolu\303\247\303\243o territorial do Brasil.txt" new file mode 100644 index 0000000000000000000000000000000000000000..d63e67617568dfca54475de2d288a4217feb3cb9 --- /dev/null +++ "b/Fontes/Wiki_Pages/Evolu\303\247\303\243o territorial do Brasil.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c22f37a3a3c77ae0b5c2ddec78a1c66e9bb84f7664e3c688edc2765d414a4bec +size 29316 diff --git "a/Fontes/Wiki_Pages/For\303\247as Armadas do Brasil.txt" "b/Fontes/Wiki_Pages/For\303\247as Armadas do Brasil.txt" new file mode 100644 index 0000000000000000000000000000000000000000..9fdc7c8b6ed97a4adf87b112a01d4452b2528adc --- /dev/null +++ "b/Fontes/Wiki_Pages/For\303\247as Armadas do Brasil.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74e2d18f844685829b8a397946dbe4727ab56e06e93c042056e3ee3327eccb40 +size 35513 diff --git "a/Fontes/Wiki_Pages/Gera\303\247\303\243o de Energia.txt" "b/Fontes/Wiki_Pages/Gera\303\247\303\243o de Energia.txt" new file mode 100644 index 0000000000000000000000000000000000000000..f79c1f102b023ee000d8947287960e38e8b4ae32 --- /dev/null +++ "b/Fontes/Wiki_Pages/Gera\303\247\303\243o de Energia.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef602c937478dad8558dc285ade3b4823f20c7c9960a1d2004271f35430245b4 +size 1558 diff --git 
"a/Fontes/Wiki_Pages/Legisla\303\247\303\243o pesqueira e maricultura.txt" "b/Fontes/Wiki_Pages/Legisla\303\247\303\243o pesqueira e maricultura.txt" new file mode 100644 index 0000000000000000000000000000000000000000..4f024ef52ca5ca3e239c8a553000181611ca6a14 --- /dev/null +++ "b/Fontes/Wiki_Pages/Legisla\303\247\303\243o pesqueira e maricultura.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f300f94d5ed6be732b2ec3a77090eba5babe0b9b94db6e6614ef00dbec9c9492 +size 4232 diff --git a/Fontes/Wiki_Pages/Litoral do Brasil.txt b/Fontes/Wiki_Pages/Litoral do Brasil.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc79ed21f315062f3371b534962c6b5ecdc2a47b --- /dev/null +++ b/Fontes/Wiki_Pages/Litoral do Brasil.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c937fb6176bb956217cd009b26b9ecb590fdb3bee6f5a21e70575c28eae94ec9 +size 7362 diff --git a/Fontes/Wiki_Pages/Mar Profundo.txt b/Fontes/Wiki_Pages/Mar Profundo.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a72ecce129b70e4c9cd5a45f8ef89392eda6c12 --- /dev/null +++ b/Fontes/Wiki_Pages/Mar Profundo.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74ee3ac5af33b0c7476bd71c98721ce6b5d677c601cc60cf9122ade51b116191 +size 1528 diff --git a/Fontes/Wiki_Pages/Mar territorial.txt b/Fontes/Wiki_Pages/Mar territorial.txt new file mode 100644 index 0000000000000000000000000000000000000000..26ec2d5d91b2bde01b8c1dddf978be1bd20f36ad --- /dev/null +++ b/Fontes/Wiki_Pages/Mar territorial.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8b47babdb078cae1692a87d901d5559570086f36c77f946c08101feb1be5fcc +size 7612 diff --git a/Fontes/Wiki_Pages/Marinha do Brasil.txt b/Fontes/Wiki_Pages/Marinha do Brasil.txt new file mode 100644 index 0000000000000000000000000000000000000000..32c2e3558e1d3be5e3db190da27312749a39b316 --- /dev/null +++ b/Fontes/Wiki_Pages/Marinha do Brasil.txt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:b81009f0b8eea0e634da1a4453751e39bf24523dec2bedbfc3cdc1b13b67d8d0 +size 8731 diff --git a/Fontes/Wiki_Pages/Microbiologia Marinha.txt b/Fontes/Wiki_Pages/Microbiologia Marinha.txt new file mode 100644 index 0000000000000000000000000000000000000000..2647c1e85b595972b6b4aa0775bfec17144c6bd3 --- /dev/null +++ b/Fontes/Wiki_Pages/Microbiologia Marinha.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e391a84809685c2f09207c9e4faaff421123018339be15eb12a76c37c6e32aa1 +size 19503 diff --git "a/Fontes/Wiki_Pages/Minera\303\247\303\243o no Mar.txt" "b/Fontes/Wiki_Pages/Minera\303\247\303\243o no Mar.txt" new file mode 100644 index 0000000000000000000000000000000000000000..ae8601eacb75b330c7191b5d323009d456f1077b --- /dev/null +++ "b/Fontes/Wiki_Pages/Minera\303\247\303\243o no Mar.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fac846bc9bbec1c025537a0b03a5c22ea5da15412bcc18d69e16a542988ab90 +size 3518 diff --git a/Fontes/Wiki_Pages/Moluscos.txt b/Fontes/Wiki_Pages/Moluscos.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e70a904ef664412fb455e50a1f8d2399e97224a --- /dev/null +++ b/Fontes/Wiki_Pages/Moluscos.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cec1545cf05d4892d008660b9854923f31706e4c05f4b6a571063a1ff4b03599 +size 10790 diff --git a/Fontes/Wiki_Pages/Pesca e aquicultura.txt b/Fontes/Wiki_Pages/Pesca e aquicultura.txt new file mode 100644 index 0000000000000000000000000000000000000000..584f7c027ef95b4fa52ea27cdb8c8cb36f137da7 --- /dev/null +++ b/Fontes/Wiki_Pages/Pesca e aquicultura.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fa23fa33db846e14cc013c14fb8814c34dc0f718d6ae2833b39af8771347b +size 8419 diff --git "a/Fontes/Wiki_Pages/Polui\303\247\303\243o e contamina\303\247\303\243o marinha.txt" "b/Fontes/Wiki_Pages/Polui\303\247\303\243o e contamina\303\247\303\243o 
marinha.txt" new file mode 100644 index 0000000000000000000000000000000000000000..5a514d0d801ceed8dee63345f6b77ea240159096 --- /dev/null +++ "b/Fontes/Wiki_Pages/Polui\303\247\303\243o e contamina\303\247\303\243o marinha.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8099e59bc52bdaf36ec85100a462a618a62bb6ede8c56d2a58ece5b1b242f6c1 +size 8660 diff --git a/Fontes/Wiki_Pages/Porifera.txt b/Fontes/Wiki_Pages/Porifera.txt new file mode 100644 index 0000000000000000000000000000000000000000..0097cc84488d9eec2d1c018cdb938d6b221e777f --- /dev/null +++ b/Fontes/Wiki_Pages/Porifera.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcf4fbac8f86549bb89496a5c17c0d9cb0fe9c2458e2dc56ce2ee5cf7723b28d +size 17143 diff --git a/Fontes/Wiki_Pages/Portos.txt b/Fontes/Wiki_Pages/Portos.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3c65c00dd0a0ff9c29909bfa0eb3645b66ec374 --- /dev/null +++ b/Fontes/Wiki_Pages/Portos.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb9673c73969bf232cc1be7194421b66739289f3b088aecafa1f021f7af83310 +size 2937 diff --git "a/Fontes/Wiki_Pages/Produ\303\247\303\243o Prim\303\241ria.txt" "b/Fontes/Wiki_Pages/Produ\303\247\303\243o Prim\303\241ria.txt" new file mode 100644 index 0000000000000000000000000000000000000000..4802a8c28041c2edaaf9c14b26b240b055d49ff6 --- /dev/null +++ "b/Fontes/Wiki_Pages/Produ\303\247\303\243o Prim\303\241ria.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:956699a58ff7cfb0b8288a7ecd2bc0c5700bde9adb000cf21fa771be93be222a +size 10989 diff --git "a/Fontes/Wiki_Pages/Produ\303\247\303\243o prim\303\241ria.txt" "b/Fontes/Wiki_Pages/Produ\303\247\303\243o prim\303\241ria.txt" new file mode 100644 index 0000000000000000000000000000000000000000..31b13576b8872124b919430a7451df74ab04d044 --- /dev/null +++ "b/Fontes/Wiki_Pages/Produ\303\247\303\243o prim\303\241ria.txt" @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:174767169e8f9e32d36a4c8ef638eeefb43433a4b31c9a161dcd1b4053c58125 +size 17559 diff --git a/Fontes/Wiki_Pages/Programa de Desenvolvimento de Submarinos.txt b/Fontes/Wiki_Pages/Programa de Desenvolvimento de Submarinos.txt new file mode 100644 index 0000000000000000000000000000000000000000..05ef46c2bb6ad51841488ba195fec0da0189e304 --- /dev/null +++ b/Fontes/Wiki_Pages/Programa de Desenvolvimento de Submarinos.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5603eb219cea632feda939c6c9833361931a2e849699eb66257647a0a39b0a5 +size 36416 diff --git "a/Fontes/Wiki_Pages/Qualidade das \303\201guas.txt" "b/Fontes/Wiki_Pages/Qualidade das \303\201guas.txt" new file mode 100644 index 0000000000000000000000000000000000000000..a535fce4ede2afb98f6d7b086fca1e8a3d517eeb --- /dev/null +++ "b/Fontes/Wiki_Pages/Qualidade das \303\201guas.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a089c4015990de5ca2d85ec5e565cffe716ea71b3b93b6fd50c773a37f599d3 +size 2783 diff --git a/Fontes/Wiki_Pages/Recursos do Mar.txt b/Fontes/Wiki_Pages/Recursos do Mar.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d69caa8ee44142928be9f94a7a410c9907c90ba --- /dev/null +++ b/Fontes/Wiki_Pages/Recursos do Mar.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9bb5e759123f97a0dacab9fee2081f944b35308b6eadc558164838d1e8ca7fb +size 7269 diff --git "a/Fontes/Wiki_Pages/Transporte e Navega\303\247\303\243o.txt" "b/Fontes/Wiki_Pages/Transporte e Navega\303\247\303\243o.txt" new file mode 100644 index 0000000000000000000000000000000000000000..5fe9e2a40a79e2cb80b418cf45c23da02a406bf9 --- /dev/null +++ "b/Fontes/Wiki_Pages/Transporte e Navega\303\247\303\243o.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8509832fb6058be860fdbf5f775d273335bdb5d0b1081f21efc0765e2a19331d +size 4153 diff --git a/Fontes/Wiki_Pages/Turismo Costeiro.txt 
b/Fontes/Wiki_Pages/Turismo Costeiro.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3852fbfb4b457f2af5a08b81ac0fca66dc4ca63 --- /dev/null +++ b/Fontes/Wiki_Pages/Turismo Costeiro.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:805f0f7663b19444692f72a9453d4914f2448513d5bbe2d99f55ddf60e8854b7 +size 4510 diff --git "a/Fontes/Wiki_Pages/Unidades de Conserva\303\247\303\243o.txt" "b/Fontes/Wiki_Pages/Unidades de Conserva\303\247\303\243o.txt" new file mode 100644 index 0000000000000000000000000000000000000000..4b03b7716ac56169eaa8d9732deeee1f76e84c7e --- /dev/null +++ "b/Fontes/Wiki_Pages/Unidades de Conserva\303\247\303\243o.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:488e7459e68aee713424a317ebece514b92c788b1e9730bed4c4216c65ff1bb0 +size 3379 diff --git "a/Fontes/Wiki_Pages/Urbaniza\303\247\303\243o do Brasil.txt" "b/Fontes/Wiki_Pages/Urbaniza\303\247\303\243o do Brasil.txt" new file mode 100644 index 0000000000000000000000000000000000000000..b61b2686eaf046c8b315b5e9822497d8ba3afd03 --- /dev/null +++ "b/Fontes/Wiki_Pages/Urbaniza\303\247\303\243o do Brasil.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b656a9e805998cfd484ed0777d95caea6e20670b5cee1b3e653a3938b51f9f5b +size 9226 diff --git "a/Fontes/Wiki_Pages/Ve\303\255culo a\303\251reo n\303\243o tripulado.txt" "b/Fontes/Wiki_Pages/Ve\303\255culo a\303\251reo n\303\243o tripulado.txt" new file mode 100644 index 0000000000000000000000000000000000000000..3c7eaf7897e8cb7e42052d488051286cc27fca60 --- /dev/null +++ "b/Fontes/Wiki_Pages/Ve\303\255culo a\303\251reo n\303\243o tripulado.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57652bbefed0a0ee8960e2236ae0ebc80c3da75c456519b03a5de0ebda5ddd0d +size 12613 diff --git a/Fontes/Wiki_Pages/Zona Costeira Brasileira.txt b/Fontes/Wiki_Pages/Zona Costeira Brasileira.txt new file mode 100644 index 
0000000000000000000000000000000000000000..1634533f44ae6ab01541cb01e05dfa9cefae7ad3 --- /dev/null +++ b/Fontes/Wiki_Pages/Zona Costeira Brasileira.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff07b7733b9b9d1b1b6a2ca137e341d14164166a72f6444411d20f12c7822e92 +size 4877 diff --git a/Fontes/Wiki_Pages/Zona abissal.txt b/Fontes/Wiki_Pages/Zona abissal.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f65cd431bbd4eb82df2e7cd2e36b9b63f0a12a2 --- /dev/null +++ b/Fontes/Wiki_Pages/Zona abissal.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:673021064f3236a390c10767937c03d6ffd50e53d71838a0fd797aabf78bdbee +size 2561 diff --git "a/Fontes/Wiki_Pages/Zona econ\303\264mica exclusiva do Brasil.txt" "b/Fontes/Wiki_Pages/Zona econ\303\264mica exclusiva do Brasil.txt" new file mode 100644 index 0000000000000000000000000000000000000000..4e4dd1d0003d624fbbd8c9d819dfd62af8298d83 --- /dev/null +++ "b/Fontes/Wiki_Pages/Zona econ\303\264mica exclusiva do Brasil.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b518b70f4e72e8d7baf3f2ed41a1f53cb3cfba09ddef2b11a304542d713fb305 +size 2202 diff --git "a/Fontes/Wiki_Pages/Zona pel\303\241gica.txt" "b/Fontes/Wiki_Pages/Zona pel\303\241gica.txt" new file mode 100644 index 0000000000000000000000000000000000000000..4069196d5f059de0c9f572ba2245cf765cf5813b --- /dev/null +++ "b/Fontes/Wiki_Pages/Zona pel\303\241gica.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f75d7f49e2478bd8d41f207f07ba7e020a7c21afc1ab8b652b6ce85d33ac02de +size 2691 diff --git "a/Fontes/Wiki_Pages/Zoologia: Anel\303\255deos marinhos.txt" "b/Fontes/Wiki_Pages/Zoologia: Anel\303\255deos marinhos.txt" new file mode 100644 index 0000000000000000000000000000000000000000..0fa8a6f457c46031a86144a995d85f2a5dddea3d --- /dev/null +++ "b/Fontes/Wiki_Pages/Zoologia: Anel\303\255deos marinhos.txt" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
"""Streamlit app that fact-checks statements about the Blue Amazon.

A statement typed by the user is sent through a two-node Haystack pipeline:
an ``EmbeddingRetriever`` fetches candidate passages from a FAISS document
store, then ``EntailmentChecker`` scores every passage against the statement
with a Natural Language Inference model and aggregates the scores.
"""
import logging
import random
import time
from json import JSONDecodeError
from typing import Dict, List, Optional, Tuple

import pandas as pd
import streamlit as st
import torch
from haystack import Document
from haystack.document_stores import FAISSDocumentStore
from haystack.modeling.utils import initialize_device_settings
from haystack.nodes import EmbeddingRetriever
from haystack.nodes.base import BaseComponent
from haystack.pipelines import Pipeline
from haystack.schema import Document
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

from Project.Fact_Checking_Blue_Amazon.config import (
    NLI_MODEL,
    RETRIEVER_MODEL,
    RETRIEVER_TOP_K,
)

# Column order of the snippets table; also used to build an empty table.
_SNIPPET_COLUMNS = ["Content", "con", "neu", "ent"]


class EntailmentChecker(BaseComponent):
    """
    Check the entailment between every document content and the statement.

    The node enriches each document's metadata with its entailment
    information and also returns the aggregate entailment information
    computed over the documents that were actually considered.
    """

    outgoing_edges = 1

    def __init__(
        self,
        model_name_or_path: str = "roberta-large-mnli",
        model_version: Optional[str] = None,
        tokenizer: Optional[str] = None,
        use_gpu: bool = True,
        batch_size: int = 100,
        entailment_contradiction_consideration: float = 0.6,
        entailment_contradiction_threshold: float = 0.8,
    ):
        """
        Load a Natural Language Inference model from Transformers.

        :param model_name_or_path: Directory of a saved model or the name of a public model.
            See https://huggingface.co/models for the full list of available models.
        :param model_version: The version of the model to use from the HuggingFace model hub.
            Can be a tag name, branch name, or commit hash.
        :param tokenizer: Name of the tokenizer (usually the same as the model).
        :param use_gpu: Whether to use GPU (if available).
        :param batch_size: Number of premise/hypothesis pairs sent to the model per forward pass.
            A non-positive value means "everything in a single pass".
        :param entailment_contradiction_consideration: A document contributes to the aggregate
            only if its entailment or its contradiction score is greater than this value.
        :param entailment_contradiction_threshold: Stop looking at further documents as soon as
            the running mean entailment or contradiction score is greater than this value.
        :raises ValueError: If the model labels do not include ``entailment``.
        """
        super().__init__()

        self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)

        tokenizer = tokenizer or model_name_or_path
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_name_or_path, revision=model_version
        )
        self.batch_size = batch_size
        self.entailment_contradiction_threshold = entailment_contradiction_threshold
        self.entailment_contradiction_consideration = entailment_contradiction_consideration
        self.model.to(str(self.devices[0]))

        # Read the labels from the model that was just loaded so that they
        # always match the requested revision (a separate AutoConfig load
        # would silently ignore `model_version`).
        id2label = self.model.config.id2label
        self.labels = [id2label[k].lower() for k in sorted(id2label)]
        if "entailment" not in self.labels:
            raise ValueError("The model config must contain entailment value in the id2label dict.")

    def run(self, query: str, documents: List[Document]) -> Tuple[Dict, str]:
        """
        Score every document against the statement and aggregate the result.

        :param query: The statement to verify (used as the NLI hypothesis).
        :param documents: Retrieved documents (each content is used as an NLI premise).
        :return: ``(output, "output_1")`` where ``output["documents"]`` holds the considered
            documents and ``output["aggregate_entailment_info"]`` the mean contradiction /
            neutral / entailment scores over them (all ``0.0`` when none was considered).
        """
        premise_batch = [doc.content for doc in documents]
        hypothesis_batch = [query] * len(documents)
        entailment_info_batch = self.get_entailment_batch(
            premise_batch=premise_batch, hypothesis_batch=hypothesis_batch
        )

        considered_documents = []
        n_considered = 0
        agg_con = agg_neu = agg_ent = 0.0
        for doc, entailment_info in zip(documents, entailment_info_batch):
            # Every scored document gets its scores attached, considered or not.
            doc.meta["entailment_info"] = entailment_info

            con, neu, ent = (
                entailment_info["contradiction"],
                entailment_info["neutral"],
                entailment_info["entailment"],
            )

            if (con > self.entailment_contradiction_consideration) or (
                ent > self.entailment_contradiction_consideration
            ):
                considered_documents.append(doc)
                agg_con += con
                agg_neu += neu
                agg_ent += ent
                n_considered += 1
                # If the first documents already give strong evidence of
                # entailment/contradiction, there is no need to consider
                # less relevant documents.
                if max(agg_con, agg_ent) / n_considered > self.entailment_contradiction_threshold:
                    break

        if n_considered:
            aggregate_entailment_info = {
                "contradiction": round(float(agg_con) / n_considered, 2),
                "neutral": round(float(agg_neu) / n_considered, 2),
                "entailment": round(float(agg_ent) / n_considered, 2),
            }
        else:
            # No document passed the consideration filter (or none was
            # retrieved): report "no evidence" instead of dividing by zero.
            aggregate_entailment_info = {"contradiction": 0.0, "neutral": 0.0, "entailment": 0.0}

        entailment_checker_result = {
            "documents": considered_documents,
            "aggregate_entailment_info": aggregate_entailment_info,
        }

        # Haystack 1.x nodes return the output dict together with the name of
        # the outgoing edge.
        return entailment_checker_result, "output_1"

    def run_batch(self, queries: List[str], documents: List[List[Document]]) -> Tuple[Dict, str]:
        """
        Batch counterpart of :meth:`run`: one list of documents per query.

        NOTE(review): Haystack 1.x documents `run_batch` as part of the custom
        node contract; confirm the expected output shape against the pinned
        Haystack version if `Pipeline.run_batch` is ever used.

        :param queries: The statements to verify.
        :param documents: For each statement, the documents retrieved for it.
        :return: ``(output, "output_1")`` where every value of ``output`` is a list with one
            entry per query, in the same order as ``queries``.
        """
        results = [
            self.run(query=query, documents=docs)[0] for query, docs in zip(queries, documents)
        ]
        output = {
            "documents": [res["documents"] for res in results],
            "aggregate_entailment_info": [res["aggregate_entailment_info"] for res in results],
        }
        return output, "output_1"

    def get_entailment_dict(self, probs) -> Dict[str, float]:
        """Map each (lower-cased) model label to its probability as a plain float."""
        return {label: float(prob) for label, prob in zip(self.labels, probs)}

    def get_entailment_batch(
        self, premise_batch: List[str], hypothesis_batch: List[str]
    ) -> List[Dict[str, float]]:
        """
        Run the NLI model on premise/hypothesis pairs.

        :param premise_batch: Premises (document contents).
        :param hypothesis_batch: Hypotheses, aligned with ``premise_batch``.
        :return: One label -> probability dict per pair, in input order.
        """
        formatted_texts = [
            f"{premise}{self.tokenizer.sep_token}{hypothesis}"
            for premise, hypothesis in zip(premise_batch, hypothesis_batch)
        ]
        if not formatted_texts:
            # Nothing to score; do not hand an empty batch to the tokenizer.
            return []

        # Honour `batch_size`; a missing or non-positive value keeps the
        # previous behaviour of a single forward pass over everything.
        step = self.batch_size if self.batch_size and self.batch_size > 0 else len(formatted_texts)

        entailment_infos: List[Dict[str, float]] = []
        with torch.inference_mode():
            for start in range(0, len(formatted_texts), step):
                chunk = formatted_texts[start : start + step]
                inputs = self.tokenizer(
                    chunk, return_tensors="pt", padding=True, truncation=True
                ).to(self.devices[0])
                logits = self.model(**inputs).logits
                probs_batch = torch.nn.functional.softmax(logits, dim=-1).detach().cpu().numpy()
                entailment_infos.extend(self.get_entailment_dict(probs) for probs in probs_batch)
        return entailment_infos


# cached to make index and models load only at start
@st.cache_resource
def start_haystack():
    """
    Load document store, retriever, entailment checker and create the pipeline.

    NOTE(review): the index paths are relative to the current working
    directory, not to this file - confirm the directory the app is launched from.
    """
    document_store = FAISSDocumentStore(
        faiss_index_path="../data//my_faiss_index.faiss",
        faiss_config_path="../data/my_faiss_index.json",
    )
    print(f"Index size: {document_store.get_document_count()}")
    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model=RETRIEVER_MODEL,
    )
    entailment_checker = EntailmentChecker(
        model_name_or_path=NLI_MODEL,
        use_gpu=False,
    )

    pipe = Pipeline()
    pipe.add_node(component=retriever, name="retriever", inputs=["Query"])
    pipe.add_node(component=entailment_checker, name="ec", inputs=["retriever"])

    return pipe


pipe = start_haystack()


@st.cache_resource
def _cached_check_statement(_pipe, statement: str, retriever_top_k: int):
    """
    Cached pipeline run, keyed on the statement and ``retriever_top_k``.

    The pipeline parameter is named with a leading underscore so that
    Streamlit leaves it out of the cache key instead of trying to hash it.
    """
    params = {"retriever": {"top_k": retriever_top_k}}
    return _pipe.run(statement, params=params)


def check_statement(pipe, statement: str, retriever_top_k: int = 5):
    """
    Run query and verify statement.

    :param pipe: The Haystack pipeline built by ``start_haystack``.
    :param statement: The statement to verify.
    :param retriever_top_k: Number of documents the retriever should return.
    :return: Whatever ``pipe.run`` returns for the statement.
    """
    return _cached_check_statement(pipe, statement, retriever_top_k)


def set_state_if_absent(key, value):
    """Initialise ``st.session_state[key]`` with ``value`` unless it is already set."""
    if key not in st.session_state:
        st.session_state[key] = value


# Small callback to reset the interface in case the text of the question changes
def reset_results(*args):
    """Clear the answer, results and raw JSON kept in the session state."""
    st.session_state.answer = None
    st.session_state.results = None
    st.session_state.raw_json = None


def create_df_for_relevant_snippets(docs):
    """
    Create a styled dataframe that contains all relevant snippets.

    :param docs: Documents whose ``meta["entailment_info"]`` has been filled in.
    :return: A pandas ``Styler`` with one row per document; an empty table
        (with the usual columns) when ``docs`` is empty.
    """
    rows = []
    for doc in docs:
        info = doc.meta["entailment_info"]
        rows.append(
            {
                "Content": doc.content,
                "con": f"{info['contradiction']:.2f}",
                "neu": f"{info['neutral']:.2f}",
                "ent": f"{info['entailment']:.2f}",
            }
        )
    # Passing the columns explicitly keeps "Content" present for empty input.
    df = pd.DataFrame(rows, columns=_SNIPPET_COLUMNS)
    df["Content"] = df["Content"].str.wrap(75)
    return df.style.apply(highlight_cols)


def highlight_cols(s):
    """Return one CSS background rule per cell for the score columns, empty rules otherwise."""
    coldict = {"con": "#FFA07A", "neu": "#E5E4E2", "ent": "#a9d39e"}
    if s.name in coldict:
        return [f"background-color: {coldict[s.name]}"] * len(s)
    return [""] * len(s)


def main():
    """Render the page, run the pipeline on demand and display the results."""
    # Persistent state
    set_state_if_absent("statement", "")
    set_state_if_absent("answer", "")
    set_state_if_absent("results", None)
    set_state_if_absent("raw_json", None)

    st.write("# Verificação de Sentenças sobre Amazônia Azul")
    st.markdown("##### Insira uma sentença sobre a amazônia azul.")

    # Search bar. A real label (hidden from view) is supplied because
    # Streamlit discourages empty labels for accessibility reasons.
    statement = st.text_input(
        "Sentença",
        value=st.session_state.statement,
        max_chars=100,
        on_change=reset_results,
        label_visibility="collapsed",
    )
    st.markdown("", unsafe_allow_html=True)

    run_pressed = st.button("Run")
    run_query = run_pressed or statement != st.session_state.statement

    # Get results for query
    if run_query and statement:
        time_start = time.time()
        reset_results()
        st.session_state.statement = statement
        with st.spinner("   Procurando a Similaridade no banco de sentenças..."):
            try:
                # The pipeline must be passed explicitly: it is the first
                # positional parameter of check_statement.
                st.session_state.results = check_statement(pipe, statement, RETRIEVER_TOP_K)
                print(f"S: {statement}")
                time_end = time.time()
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                print(f"elapsed time: {time_end - time_start}")
            except JSONDecodeError:
                st.error("👓    Erro na document store.")
                return
            except Exception as e:
                # Top-level UI boundary: log the traceback, show a generic error.
                logging.exception(e)
                st.error("🐞    Erro Genérico.")
                return

    # Display results
    if st.session_state.results:
        docs = st.session_state.results["documents"]
        agg_entailment_info = st.session_state.results["aggregate_entailment_info"]

        st.markdown("###### Aggregate entailment information:")
        st.write(agg_entailment_info)
        st.markdown("###### Most Relevant snippets:")
        if docs:
            st.dataframe(create_df_for_relevant_snippets(docs))
        else:
            # No retrieved passage was confident enough to be considered.
            st.info("Nenhum trecho relevante encontrado para esta sentença.")


main()
https://git-lfs.github.com/spec/v1 +oid sha256:8b3452ab48213148c29a3dc1f026b35455711cab2ed51725ee70d594c2ef1fce +size 150243373 diff --git a/data/my_faiss_index.json b/data/my_faiss_index.json new file mode 100644 index 0000000000000000000000000000000000000000..30663fe0721d7f15018dcab5f34ffb34d3e692c4 --- /dev/null +++ b/data/my_faiss_index.json @@ -0,0 +1 @@ +{"similarity": "dot_product", "embedding_dim": 512} \ No newline at end of file