Hasan Iqbal committed on
Commit
cbfd993
·
unverified ·
1 Parent(s): cff683c

Added LLM Response Evaluation frontend

Browse files
error_output.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ```python
2
+ ["This is a sample LLM response."]
3
+ ```
4
+ ```python
5
+ ["{'text': 'This is a sample LLM response.'}"]
6
+ ```
7
def process(input: str):
    """
    Split a (possibly dict-formatted) response string into sentences.

    The wrapper text ``{'text': '`` and ``'}`` is removed if present, the
    remainder is split on ". ", and every non-empty fragment is returned
    with terminal punctuation.

    Args:
        input: Raw response text, optionally formatted like
            ``"{'text': '...'}"``. (The name shadows the builtin but is kept
            because it is part of the existing signature.)

    Returns:
        A list with one string per sentence; empty when there is no text.
    """
    text = input.replace("{'text': '", "").replace("'}", "")

    processed_sentences = []
    for fragment in text.split(". "):
        # Skip empty pieces produced by empty input or a trailing ". ".
        if not fragment.strip():
            continue
        # The last fragment usually keeps its own punctuation; only add a
        # period when the split consumed it.
        if fragment.endswith((".", "!", "?")):
            processed_sentences.append(fragment)
        else:
            processed_sentences.append(fragment + ".")
    return processed_sentences
requirements.txt CHANGED
@@ -1,174 +1,172 @@
1
- aiofiles
2
- aiohttp
3
- aiosignal
4
- alabaster
5
- altair
6
- annotated-types
7
- anyio
8
- arrow
9
- async-timeout
10
- asyncio
11
- attrs
12
- Babel
13
- backoff
14
- beautifulsoup4
15
- bibtexparser
16
- blinker
17
- blis
18
- bs4
19
- cachetools
20
- catalogue
21
- certifi
22
- charset-normalizer
23
- click
24
- cloudpathlib
25
- confection
26
- contourpy
27
- cycler
28
- cymem
29
- datasets
30
- Deprecated
31
- dill
32
- distro
33
- dnspython
34
- docutils
35
- email_validator
36
  en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
37
- evaluate
38
- factool
39
- fake-useragent
40
- fastapi
41
- fastapi-cli
42
- ffmpy
43
- filelock
44
- Flask
45
- fonttools
46
- free-proxy
47
- frozenlist
48
- fsspec
49
- gitdb
50
- GitPython
51
- gradio
52
- gradio_client
53
- h11
54
- httpcore
55
- httptools
56
- httpx
57
- huggingface-hub
58
- idna
59
- imagesize
60
- importlib_resources
61
- itsdangerous
62
- Jinja2
63
- joblib
64
- jsonlines
65
- jsonschema
66
- jsonschema-specifications
67
- kiwisolver
68
- langcodes
69
- language_data
70
- lxml
71
- marisa-trie
72
- markdown-it-py
73
- MarkupSafe
74
- matplotlib
75
- mdurl
76
- mpmath
77
- multidict
78
- multiprocess
79
- murmurhash
80
- networkx
81
- nltk
82
- numpy
83
- openai
84
- orjson
85
- outcome
86
- packaging
87
- pandas
88
- pillow
89
- preshed
90
- protobuf
91
- pyarrow
92
- pyarrow-hotfix
93
- pydantic
94
- pydantic_core
95
- pydeck
96
- pydub
97
- Pygments
98
- pyparsing
99
- PySocks
100
- python-dateutil
101
- python-dotenv
102
- python-multipart
103
- pytz
104
- PyYAML
105
- referencing
106
- regex
107
- requests
108
- rich
109
- rpds-py
110
- ruff
111
- safetensors
112
- scholarly
113
- scikit-learn
114
- scipy
115
- selenium
116
- semantic-version
117
- sentence-transformers
118
- shellingham
119
- six
120
- smart-open
121
- smmap
122
- sniffio
123
- snowballstemmer
124
- sortedcontainers
125
- soupsieve
126
- spacy
127
- spacy-legacy
128
- spacy-loggers
129
- Sphinx
130
- sphinx-rtd-theme
131
- sphinxcontrib-applehelp
132
- sphinxcontrib-devhelp
133
- sphinxcontrib-htmlhelp
134
- sphinxcontrib-jquery
135
- sphinxcontrib-jsmath
136
- sphinxcontrib-qthelp
137
- sphinxcontrib-serializinghtml
138
- srsly
139
- starlette
140
- streamlit
141
- streamlit-option-menu
142
- sympy
143
- tenacity
144
- thinc
145
- threadpoolctl
146
- tokenizers
147
- toml
148
- tomlkit
149
- toolz
150
- torch
151
- torchaudio
152
- torchvision
153
- tornado
154
- tqdm
155
- transformers
156
- trio
157
- trio-websocket
158
- typer
159
- types-python-dateutil
160
- typing_extensions
161
- tzdata
162
- urllib3
163
- uvicorn
164
- uvloop
165
- wasabi
166
- watchfiles
167
- weasel
168
- websocket-client
169
- websockets
170
- Werkzeug
171
- wrapt
172
- wsproto
173
- xxhash
174
- yarl
 
 
1
+ aiohttp==3.8.4
2
+ aiosignal==1.3.1
3
+ alabaster==0.7.16
4
+ altair==5.3.0
5
+ annotated-types==0.7.0
6
+ anyio==4.4.0
7
+ arrow==1.3.0
8
+ async-timeout==4.0.3
9
+ asyncio==3.4.3
10
+ attrs==23.2.0
11
+ Babel==2.15.0
12
+ backoff==2.2.1
13
+ beautifulsoup4==4.12.3
14
+ bibtexparser==1.4.1
15
+ blinker==1.8.2
16
+ blis==0.7.11
17
+ bs4==0.0.2
18
+ cachetools==5.4.0
19
+ catalogue==2.0.10
20
+ certifi==2024.7.4
21
+ charset-normalizer==3.3.2
22
+ click==8.1.7
23
+ cloudpathlib==0.18.1
24
+ confection==0.1.5
25
+ contourpy==1.2.1
26
+ cycler==0.12.1
27
+ cymem==2.0.8
28
+ datasets==2.20.0
29
+ Deprecated==1.2.14
30
+ dill==0.3.8
31
+ distro==1.9.0
32
+ docutils==0.20.1
 
 
 
33
  en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
34
+ entrypoints==0.4
35
+ evaluate==0.4.2
36
+ factool==0.1.3
37
+ fake-useragent==1.5.1
38
+ Faker==26.0.0
39
+ fastapi==0.96.0
40
+ favicon==0.7.0
41
+ filelock==3.15.4
42
+ fonttools==4.53.1
43
+ free-proxy==1.1.1
44
+ frozenlist==1.4.1
45
+ fsspec==2024.5.0
46
+ gitdb==4.0.11
47
+ GitPython==3.1.43
48
+ h11==0.14.0
49
+ htbuilder==0.6.2
50
+ httpcore==1.0.5
51
+ httpx==0.27.0
52
+ huggingface-hub==0.24.2
53
+ idna==3.7
54
+ imagesize==1.4.1
55
+ Jinja2==3.1.4
56
+ joblib==1.4.2
57
+ jsonlines==4.0.0
58
+ jsonschema==4.23.0
59
+ jsonschema-specifications==2023.12.1
60
+ kiwisolver==1.4.5
61
+ langcodes==3.4.0
62
+ language_data==1.2.0
63
+ lxml==5.2.2
64
+ marisa-trie==1.2.0
65
+ Markdown==3.6
66
+ markdown-it-py==3.0.0
67
+ markdownlit==0.0.7
68
+ MarkupSafe==2.1.5
69
+ matplotlib==3.9.1
70
+ mdurl==0.1.2
71
+ more-itertools==10.3.0
72
+ mpmath==1.3.0
73
+ multidict==6.0.5
74
+ multiprocess==0.70.16
75
+ murmurhash==1.0.10
76
+ networkx==3.3
77
+ nltk==3.8.1
78
+ numpy==1.26.4
79
+ openai==1.37.0
80
+ outcome==1.3.0.post0
81
+ packaging==24.1
82
+ pandas==2.2.2
83
+ pillow==10.4.0
84
+ preshed==3.0.9
85
+ prometheus_client==0.20.0
86
+ protobuf==5.27.2
87
+ pyarrow==17.0.0
88
+ pyarrow-hotfix==0.6
89
+ pydantic==1.10.9
90
+ pydantic_core==2.20.1
91
+ pydeck==0.9.1
92
+ Pygments==2.18.0
93
+ pymdown-extensions==10.8.1
94
+ pyparsing==3.1.2
95
+ PySocks==1.7.1
96
+ python-dateutil==2.9.0.post0
97
+ python-dotenv==1.0.1
98
+ pytz==2024.1
99
+ PyYAML==6.0
100
+ referencing==0.35.1
101
+ regex==2024.7.24
102
+ requests==2.32.3
103
+ rich==13.7.1
104
+ rpds-py==0.19.1
105
+ safetensors==0.4.3
106
+ scholarly==1.7.11
107
+ scikit-learn==1.5.1
108
+ scipy==1.14.0
109
+ selenium==4.23.1
110
+ sentence-transformers==3.0.1
111
+ shellingham==1.5.4
112
+ six==1.16.0
113
+ smart-open==7.0.4
114
+ smmap==5.0.1
115
+ sniffio==1.3.1
116
+ snowballstemmer==2.2.0
117
+ sortedcontainers==2.4.0
118
+ soupsieve==2.5
119
+ spacy==3.7.5
120
+ spacy-legacy==3.0.12
121
+ spacy-loggers==1.0.5
122
+ Sphinx==7.4.7
123
+ sphinx-rtd-theme==2.0.0
124
+ sphinxcontrib-applehelp==1.0.8
125
+ sphinxcontrib-devhelp==1.0.6
126
+ sphinxcontrib-htmlhelp==2.0.6
127
+ sphinxcontrib-jquery==4.1
128
+ sphinxcontrib-jsmath==1.0.1
129
+ sphinxcontrib-qthelp==1.0.8
130
+ sphinxcontrib-serializinghtml==1.1.10
131
+ srsly==2.4.8
132
+ st-annotated-text==4.0.1
133
+ st-theme==1.2.3
134
+ starlette==0.27.0
135
+ streamlit==1.36.0
136
+ streamlit-camera-input-live==0.2.0
137
+ streamlit-card==1.0.2
138
+ streamlit-embedcode==0.1.2
139
+ streamlit-extras==0.4.3
140
+ streamlit-faker==0.0.3
141
+ streamlit-image-coordinates==0.1.9
142
+ streamlit-keyup==0.2.4
143
+ streamlit-option-menu==0.3.13
144
+ streamlit-toggle-switch==1.0.2
145
+ streamlit-vertical-slider==2.5.5
146
+ sympy==1.13.1
147
+ tenacity==8.5.0
148
+ thinc==8.2.5
149
+ threadpoolctl==3.5.0
150
+ tokenizers==0.19.1
151
+ toml==0.10.2
152
+ toolz==0.12.1
153
+ torch==2.4.0
154
+ tornado==6.4.1
155
+ tqdm==4.66.4
156
+ transformers==4.43.2
157
+ trio==0.26.0
158
+ trio-websocket==0.11.1
159
+ typer==0.12.3
160
+ types-python-dateutil==2.9.0.20240316
161
+ typing_extensions==4.12.2
162
+ tzdata==2024.1
163
+ urllib3==2.2.2
164
+ uvicorn==0.22.0
165
+ validators==0.33.0
166
+ wasabi==1.1.3
167
+ weasel==0.4.1
168
+ websocket-client==1.8.0
169
+ wrapt==1.16.0
170
+ wsproto==1.2.0
171
+ xxhash==3.4.1
172
+ yarl==1.9.4
scripts/app.sh CHANGED
@@ -11,4 +11,4 @@ source "${BASH_SOURCE%/*}/common.sh"
11
 
12
  # Executing Python script
13
  export PYTHONPATH="$PYTHONPATH:src/"
14
- gradio src/openfactcheck/app/app.py --demo-name=demo
 
11
 
12
  # Executing Python script
13
  export PYTHONPATH="$PYTHONPATH:src/"
14
+ streamlit run src/openfactcheck/app/app.py -- "$@"
src/openfactcheck/app/app.py CHANGED
@@ -1,11 +1,31 @@
 
1
  import streamlit as st
2
  from streamlit_option_menu import option_menu
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  class App:
5
  def __init__(self):
6
  pass
7
 
8
- def run(self):
 
 
 
9
 
10
  # Set up Dashboard
11
  st.set_page_config(page_title="OpenFactCheck Dashboard",
@@ -24,6 +44,20 @@ class App:
24
  orientation="horizontal"
25
  )
26
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  if __name__ == "__main__":
 
 
28
  app = App()
29
- app.run()
 
1
+ import argparse
2
  import streamlit as st
3
  from streamlit_option_menu import option_menu
4
 
5
+ from openfactcheck.core.base import OpenFactCheck, OpenFactCheckConfig
6
+ from openfactcheck.app.evaluate_response import evaluate_response
7
+
8
def parse_args():
    """
    Parse the command-line options of the OpenFactCheck app.

    Returns:
        The argparse namespace; ``config_path`` holds the value of
        ``--config-path`` (default ``"config.json"``).
    """
    cli = argparse.ArgumentParser(
        description='Initialize OpenFactCheck with custom configuration.'
    )

    # Location of the configuration file handed to OpenFactCheckConfig
    cli.add_argument(
        "--config-path",
        default="config.json",
        type=str,
        help="Config File Path",
    )

    return cli.parse_args()
20
+
21
  class App:
22
  def __init__(self):
23
  pass
24
 
25
+ def run(self, config_path: str = "config.json"):
26
+ # Initialize OpenFactCheck
27
+ config = OpenFactCheckConfig(config_path)
28
+ ofc = OpenFactCheck(config)
29
 
30
  # Set up Dashboard
31
  st.set_page_config(page_title="OpenFactCheck Dashboard",
 
44
  orientation="horizontal"
45
  )
46
 
47
+ # Load the selected page
48
+ if selected == "Evaluate LLM Response":
49
+ evaluate_response(ofc)
50
+ # elif selected == "Evaluate LLM":
51
+ # evaluate_llm()
52
+ # elif selected == "Evaluate FactChecker":
53
+ # evaluate_factchecker()
54
+ # elif selected == "Leaderboards":
55
+ # leaderboards()
56
+ # else:
57
+ # about()
58
+
59
  if __name__ == "__main__":
60
+ args = parse_args()
61
+
62
  app = App()
63
+ app.run(args.config_path)
src/openfactcheck/app/evaluate_response.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import time
3
+ import pandas as pd
4
+ import streamlit as st
5
+
6
+ from openfactcheck.core.base import OpenFactCheck
7
+ from openfactcheck.app.utils import style_metric_cards
8
+
9
+ # Create a function to check a LLM response
10
def _extract_claim_text(claim):
    """
    Return the display text of a claim.

    A claim may be a plain string, a dict with a "text" key, or a string that
    is the repr of such a dict (e.g. "{'text': '...'}").
    """
    import ast  # local import: only needed for dict-formatted claim strings

    if isinstance(claim, dict):
        return str(claim.get("text", claim))
    if not isinstance(claim, str):
        return str(claim)

    # Parse a dict repr properly so text containing apostrophes is handled.
    if claim.lstrip().startswith("{"):
        try:
            parsed = ast.literal_eval(claim.strip())
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, dict) and "text" in parsed:
            return str(parsed["text"])

    # Fall back to the original pattern match for partially formatted strings.
    match = re.search(r"'text': '([^']+)'", claim)
    if match:
        return match.group(1)
    return claim  # Return as is if no dictionary format detected


def _stream_words(text, delay=0.01):
    """Yield `text` word by word, pausing briefly to simulate typing."""
    for word in text.split(" "):
        yield word + " "
        time.sleep(delay)


def evaluate_response(ofc: OpenFactCheck):
    """
    This function creates a Streamlit app to evaluate the factuality of a LLM response.

    The page lets the user pick one claim processor, one retriever and one
    verifier, runs the resulting pipeline on the entered text in streaming
    mode, and renders the detected claims, retrieved evidences and the
    overall factuality label.

    Args:
        ofc: OpenFactCheck instance used to list solvers, configure the
            pipeline and evaluate the response.
    """
    if 'response' not in st.session_state:
        st.session_state.response = None

    # Initialize the solvers
    claimprocessors = ofc.list_claimprocessors()
    retrievers = ofc.list_retrievers()
    verifiers = ofc.list_verifiers()

    st.write("This is where you can check factuality of a LLM response.")

    # Customize FactChecker
    st.write("Customize FactChecker")

    # Dropdown in three columns
    col1, col2, col3 = st.columns(3)
    with col1:
        claimprocessor = st.selectbox("Select Claim Processor", list(claimprocessors))
    with col2:
        retriever = st.selectbox("Select Retriever", list(retrievers))
    with col3:
        verifier = st.selectbox("Select Verifier", list(verifiers))

    # Input
    input_text = {"text": st.text_area("Enter LLM response here", "This is a sample LLM response.")}

    # Button to check factuality
    if st.button("Check Factuality"):
        with st.status("Checking factuality...", expanded=True) as status:
            # Configure the pipeline
            st.write("Configuring pipeline...")
            ofc.init_pipeline_manually([claimprocessor, retriever, verifier])
            st.write("Pipeline configured...")

            # Evaluate the response
            st.write("Evaluating response...")

            # With stream=True this is consumed lazily by st.write_stream below
            response = ofc(input_text, stream=True)
            st.write("Response evaluated...")

            status.update(label="Factuality checked...", state="complete", expanded=False)

        # Display pipeline configuration
        pipeline_str = "   ┈➤   ".join([claimprocessor, retriever, verifier])
        st.info(f"""**Pipeline**:    \n{pipeline_str}""")

        # Store the final response in the session state
        st.session_state.final_response = None

        # Remember that a response was evaluated so the Reset button is
        # offered; previously this was never set and Reset was unreachable.
        st.session_state.response = input_text["text"]

        col1, col2 = st.columns([3, 1])
        with col1:
            def process_stream(responses):
                """
                Process each response from the stream as a simulated chat output.
                This function yields each word from the formatted text of the response,
                adding a slight delay to simulate typing in a chat.
                """
                for step in responses:
                    solver_name = step["solver_name"]

                    if "claimprocessor" in solver_name:
                        # Extract response details
                        output_text = step["output"]

                        # Get the detected claims
                        detected_claims = output_text.get("claims", [])

                        # Generate formatted text with enumerated claims in Markdown format
                        formatted_text = "#### Detected Claims\n" + "\n".join(
                            f"{i}. {_extract_claim_text(claim)}"
                            for i, claim in enumerate(detected_claims, start=1)
                        ) + "\n"

                        with col2:
                            st.metric(label="Detected Claims", value=len(detected_claims))
                            style_metric_cards(background_color="#F0F0F0", border_color="#F0F0F0", border_radius_px=0)

                        yield from _stream_words(formatted_text)

                        st.session_state.claimprocessor_flag = True

                    elif "retriever" in solver_name:
                        # Extract response details
                        output_text = step["output"]

                        # Each evidence entry is indexed at [1]; presumably a
                        # (question, evidence) pair -- TODO confirm with the retrievers.
                        evidences = [
                            evidence[1]
                            for claim_with_evidences in output_text.get("claims_with_evidences", {}).values()
                            for evidence in claim_with_evidences
                        ]

                        # Generate formatted text with enumerated evidences in Markdown format
                        formatted_text = "#### Retrieved Evidences\n" + "\n".join(
                            f"{i}. {evidence}" for i, evidence in enumerate(evidences, start=1)
                        )

                        with col2:
                            st.metric(label="Retrieved Evidences", value=len(evidences))
                            style_metric_cards(background_color="#F0F0F0", border_color="#F0F0F0", border_radius_px=0)

                        yield from _stream_words(formatted_text)

                    elif "verifier" in solver_name:
                        # Store the final response in the session state. Nothing is
                        # streamed here: the verdict is rendered as a metric below
                        # (the old code re-streamed the previous step's text).
                        st.session_state.final_response = step["output"]

            st.write_stream(process_stream(response))

        # Process the final response
        final_response = st.session_state.final_response
        if final_response is not None:
            overall_factuality = final_response.get("label", "Unknown")
            with col2:
                # Equality (not identity) is kept on purpose so non-bool truthy
                # labels such as 1/0 keep matching as before.
                if overall_factuality == True:  # noqa: E712
                    st.metric(label="Overall Factuality", value="True")
                    style_metric_cards(background_color="#D4EDDA", border_color="#D4EDDA", border_radius_px=0, border_left_color="#28A745")
                elif overall_factuality == False:  # noqa: E712
                    st.metric(label="Overall Factuality", value="False")
                    style_metric_cards(background_color="#F8D7DA", border_color="#F8D7DA", border_radius_px=0, border_left_color="#DC3545")

    # Button to reset
    if st.session_state.response is not None:
        if st.button("Reset"):
            st.session_state.response = None
            st.rerun()
src/openfactcheck/app/utils.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
def style_metric_cards(
    background_color: str = "#FFF",
    border_size_px: int = 1,
    border_color: str = "#CCC",
    border_radius_px: int = 5,
    border_left_color: str = "#9AD8E1",
) -> None:
    """
    Applies a custom style to st.metrics in the page

    The style is injected as a <style> element whose selectors match every
    metric container, so it is not scoped to a single metric.

    Args:
        background_color (str, optional): Background color. Defaults to "#FFF".
        border_size_px (int, optional): Border size in pixels. Defaults to 1.
        border_color (str, optional): Border color. Defaults to "#CCC".
        border_radius_px (int, optional): Border radius in pixels. Defaults to 5.
        border_left_color (str, optional): Border left color. Defaults to "#9AD8E1".
    """
    st.markdown(
        f"""
        <style>
            div[data-testid="stMetric"],
            div[data-testid="metric-container"] {{
                background-color: {background_color};
                border: {border_size_px}px solid {border_color};
                padding: 5% 5% 5% 10%;
                border-radius: {border_radius_px}px;
                border-left: 0.5rem solid {border_left_color} !important;
            }}
        </style>
        """,
        # Required so Streamlit renders the raw <style> element
        unsafe_allow_html=True,
    )
src/openfactcheck/core/base.py CHANGED
@@ -1,10 +1,11 @@
1
  import os
2
  import sys
 
3
  import tqdm
4
- import yaml
5
  import json
6
  import traceback
7
  from pathlib import Path
 
8
 
9
  from openfactcheck.lib.logger import logger
10
  from openfactcheck.lib.config import OpenFactCheckConfig
@@ -12,6 +13,66 @@ from openfactcheck.core.solver import SOLVER_REGISTRY, Solver
12
  from openfactcheck.core.state import FactCheckerState
13
 
14
  class OpenFactCheck:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def __init__(self, config: OpenFactCheckConfig):
16
  """
17
  Initialize OpenFactCheck with the given configuration.
@@ -61,6 +122,45 @@ class OpenFactCheck:
61
  """
62
  return SOLVER_REGISTRY
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  def init_solver(self, solver_name, args):
65
  """
66
  Initialize a solver with the given configuration
@@ -130,36 +230,85 @@ class OpenFactCheck:
130
  with open(os.path.join(self.output_path, f'{sample_name}.jsonl'), 'a', encoding="utf-8") as f:
131
  f.write(json.dumps(result, ensure_ascii=False) + '\n')
132
 
133
- def __call__(self, response: str, question: str = None, callback_fun=None, **kwargs):
134
- sample_name = kwargs.get("sample_name", 0)
135
- solver_output = FactCheckerState(question=question, response=response)
136
- oname = "response"
137
- for idx, (name, (solver, iname, oname)) in tqdm.tqdm(enumerate(self.pipeline.items()),
138
- total=len(self.pipeline)):
139
- logger.info(f"Invoking solver: {idx}-{name}")
140
- logger.debug(f"State content: {solver_output}")
141
- try:
142
- solver_input = solver_output
143
- cont, solver_output = solver(solver_input, **kwargs)
144
- logger.debug(f"Latest result: {solver_output}")
145
- if callback_fun:
146
- callback_fun(
147
- index=idx,
148
- sample_name=sample_name,
149
- solver_name=name,
150
- input_name=iname,
151
- output_name=oname,
152
- input=solver_input.__dict__,
153
- output=solver_output.__dict__,
154
- continue_run=cont
155
- )
156
- self.persist_output(solver_output, idx, name, cont, sample_name=sample_name)
157
- except:
158
- print(traceback.format_exc())
159
- cont = False
160
- oname = iname
161
- if not cont:
162
- logger.info(f"Break at {name}")
163
- break
164
-
165
- return solver_output.get(oname)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import sys
3
+ import uuid
4
  import tqdm
 
5
  import json
6
  import traceback
7
  from pathlib import Path
8
+ from typing import Callable
9
 
10
  from openfactcheck.lib.logger import logger
11
  from openfactcheck.lib.config import OpenFactCheckConfig
 
13
  from openfactcheck.core.state import FactCheckerState
14
 
15
  class OpenFactCheck:
16
+ """
17
+ OpenFactCheck class to evaluate the factuality of a response using a pipeline of solvers.
18
+
19
+ Parameters
20
+ ----------
21
+ config : OpenFactCheckConfig
22
+ An instance of OpenFactCheckConfig containing the configuration
23
+ settings for OpenFactCheck.
24
+
25
+ Attributes
26
+ ----------
27
+ logger : Logger
28
+ An instance of the logger to log messages.
29
+ config : OpenFactCheckConfig
30
+ An instance of OpenFactCheckConfig containing the configuration
31
+ settings for OpenFactCheck.
32
+ solver_configs : dict
33
+ A dictionary containing the configuration settings for the solvers.
34
+ pipeline : dict
35
+ A mapping from solver name to its (solver, input_name, output_name) tuple.
36
+ output_path : str
37
+ The path to the output directory where the results will be stored.
38
+
39
+ Methods
40
+ -------
41
+ load_solvers(solver_paths)
42
+ Load solvers from the given paths.
43
+ list_solvers()
44
+ List all registered solvers.
45
+ list_claimprocessors()
46
+ List all registered claim processors.
47
+ list_retrievers()
48
+ List all registered retrievers.
49
+ list_verifiers()
50
+ List all registered verifiers.
51
+ init_solver(solver_name, args)
52
+ Initialize a solver with the given configuration.
53
+ init_solvers()
54
+ Initialize all registered solvers.
55
+ init_pipeline()
56
+ Initialize the pipeline with the given configuration.
57
+ init_pipeline_manually(pipeline)
58
+ Initialize the pipeline with the given configuration.
59
+ persist_output(state, idx, solver_name, cont, sample_name=0)
60
+ Persist the output of the solver.
61
+ read_output(sample_name)
62
+ Read the output file for the given sample.
63
+ remove_output(sample_name)
64
+ Remove the output file for the given sample.
65
+ __call__(response, question, stream, callback, **kwargs)
66
+ Evaluate the response using the pipeline.
67
+
68
+ Examples
69
+ --------
70
+ >>> config = OpenFactCheckConfig("config.json")
71
+ >>> ofc = OpenFactCheck(config)
72
+ >>> steps = list(ofc("This is a sample response.", stream=True, sample_name="sample"))
73
+ >>> output = ofc.read_output("sample")
74
+ >>> ofc.remove_output("sample")
75
+ """
76
  def __init__(self, config: OpenFactCheckConfig):
77
  """
78
  Initialize OpenFactCheck with the given configuration.
 
122
  """
123
  return SOLVER_REGISTRY
124
 
125
@staticmethod
def list_claimprocessors():
    """
    List all registered claim processors

    Returns a dict with the SOLVER_REGISTRY entries whose name contains
    "claimprocessor", in registry order.
    """
    return {
        name: entry
        for name, entry in SOLVER_REGISTRY.items()
        if "claimprocessor" in name
    }
137
+
138
@staticmethod
def list_retrievers():
    """
    List all registered retrievers

    Returns a dict with the SOLVER_REGISTRY entries whose name contains
    "retriever", in registry order.
    """
    return {
        name: entry
        for name, entry in SOLVER_REGISTRY.items()
        if "retriever" in name
    }
150
+
151
@staticmethod
def list_verifiers():
    """
    List all registered verifiers

    Returns a dict with the SOLVER_REGISTRY entries whose name contains
    "verifier", in registry order.
    """
    return {
        name: entry
        for name, entry in SOLVER_REGISTRY.items()
        if "verifier" in name
    }
163
+
164
  def init_solver(self, solver_name, args):
165
  """
166
  Initialize a solver with the given configuration
 
230
  with open(os.path.join(self.output_path, f'{sample_name}.jsonl'), 'a', encoding="utf-8") as f:
231
  f.write(json.dumps(result, ensure_ascii=False) + '\n')
232
 
233
def read_output(self, sample_name):
    """
    Read the output file for the given sample

    Opens ``<output_path>/<sample_name>.jsonl`` and returns one decoded JSON
    value per line, in file order.
    """
    output_file = os.path.join(self.output_path, f'{sample_name}.jsonl')

    records = []
    with open(output_file, 'r', encoding="utf-8") as handle:
        for raw_line in handle:
            records.append(json.loads(raw_line))
    return records
239
+
240
def remove_output(self, sample_name):
    """
    Remove the output file for the given sample

    Deletes ``<output_path>/<sample_name>.jsonl``.
    """
    target = os.path.join(self.output_path, f'{sample_name}.jsonl')
    os.remove(target)
245
+
246
def __call__(self, response: str, question: str = None, stream: bool = False, callback: Callable = None, **kwargs):
    """
    Evaluate the response using the pipeline

    Parameters
    ----------
    response : str
        The response to evaluate; stored in the initial FactCheckerState.
    question : str, optional
        The question the response answers, stored alongside the response.
    stream : bool, optional
        When True, return a generator that yields one dict per successfully
        executed solver. When False (default), run the whole pipeline and
        return the final output value.
    callback : Callable, optional
        Called with keyword arguments after every successful solver run.
    **kwargs
        Forwarded to every solver. ``sample_name`` (optional) names the
        persisted output; a random 6-character hex name is used otherwise.

    Returns
    -------
    generator or object
        The per-solver generator when ``stream`` is True, otherwise the
        state value stored under the last executed solver's output name (or
        under the failing solver's input name if a solver raised).
    """

    def evaluate_response():
        # Check if sample_name is provided in kwargs else generate a random one
        sample_name = kwargs.get("sample_name", uuid.uuid4().hex[:6])

        # Initialize the state
        solver_output = FactCheckerState(question=question, response=response)

        # Initialize the output name
        output_name = "response"
        for idx, (name, (solver, input_name, output_name)) in tqdm.tqdm(enumerate(self.pipeline.items()),
                                                                         total=len(self.pipeline)):
            logger.info(f"Invoking solver: {idx}-{name}")
            logger.info(f"State content: {solver_output}")

            step = None
            try:
                # Solver input is the output of the previous solver
                solver_input = solver_output

                # Run the solver
                cont, solver_output = solver(solver_input, **kwargs)

                logger.debug(f"Latest result: {solver_output}")
                if callback:
                    callback(
                        index=idx,
                        sample_name=sample_name,
                        solver_name=name,
                        input_name=input_name,
                        output_name=output_name,
                        input=solver_input.__dict__,
                        output=solver_output.__dict__,
                        continue_run=cont
                    )

                # Snapshot of this step, handed to streaming consumers
                step = {
                    "index": idx,
                    "solver_name": name,
                    "input_name": input_name,
                    "output_name": output_name,
                    "input": solver_input.__dict__,
                    "output": solver_output.__dict__,
                    "continue_run": cont
                }

                # Persist before yielding so the output is written even if
                # the consumer stops iterating early.
                self.persist_output(solver_output, idx, name, cont, sample_name=sample_name)

            except Exception:
                # Exception (not a bare except) so KeyboardInterrupt and
                # GeneratorExit are not swallowed.
                logger.error(f"Error at {traceback.format_exc()}")
                cont = False
                output_name = input_name

            # Yield outside the try block so errors raised by the consumer
            # are not mistaken for solver failures.
            if step is not None:
                yield step

            # Break if the solver returns False
            if not cont:
                logger.info(f"Break at {name}")
                break

        return solver_output.get(output_name)

    steps = evaluate_response()
    if stream:
        return steps

    # A function containing `yield` is always a generator, so the
    # non-streaming path must drain it to actually run the pipeline and
    # pick up its return value.
    while True:
        try:
            next(steps)
        except StopIteration as finished:
            return finished.value
src/openfactcheck/solvers/rarr_solvers/rarr_agreement_gate.py CHANGED
@@ -14,7 +14,7 @@ class RARRAgreementGate(StandardTaskSolver):
14
  def __init__(self, args):
15
  super().__init__(args)
16
  self.max_evidences_per_question = args.get("max_evidences_per_question", 1)
17
- self.model = self.global_config.get("model", "text-davinci-003")
18
 
19
  def __call__(self, state: FactCheckerState, *args, **kwargs):
20
  claims = state.get(self.input_name)
 
14
  def __init__(self, args):
15
  super().__init__(args)
16
  self.max_evidences_per_question = args.get("max_evidences_per_question", 1)
17
+ self.model = self.global_config.get("model", "gpt-3.5-turbo-instruct")
18
 
19
  def __call__(self, state: FactCheckerState, *args, **kwargs):
20
  claims = state.get(self.input_name)
src/openfactcheck/solvers/rarr_solvers/rarr_editor.py CHANGED
@@ -14,8 +14,8 @@ import Levenshtein
14
  class RARREditor(StandardTaskSolver):
15
  def __init__(self, args):
16
  super().__init__(args)
17
- self.model = self.global_config.get("model", "text-davinci-003")
18
- # self.model = args.get("model", "text-davinci-003")
19
  self.max_evidences_per_question = args.get("max_evidences_per_question", 1)
20
  self.max_edit_ratio = args.get("max_edit_ratio", 100)
21
  self.output_claim_only = args.get("output_claim_only", False)
 
14
  class RARREditor(StandardTaskSolver):
15
  def __init__(self, args):
16
  super().__init__(args)
17
+ self.model = self.global_config.get("model", "gpt-3.5-turbo-instruct")
18
+ # self.model = args.get("model", "gpt-3.5-turbo-instruct")
19
  self.max_evidences_per_question = args.get("max_evidences_per_question", 1)
20
  self.max_edit_ratio = args.get("max_edit_ratio", 100)
21
  self.output_claim_only = args.get("output_claim_only", False)
src/openfactcheck/solvers/rarr_solvers/rarr_llm_retriever.py CHANGED
@@ -13,7 +13,7 @@ from .prompts.hallucination_prompts import EVIDENCE_HALLUCINATION
13
  class LLMRetriever(StandardTaskSolver):
14
  def __init__(self, args):
15
  super().__init__(args)
16
- self.model = self.global_config.get("model", "text-davinci-003")
17
 
18
  def __call__(self, state: FactCheckerState, *args, **kwargs):
19
  claims = state.get(self.input_name)
 
13
  class LLMRetriever(StandardTaskSolver):
14
  def __init__(self, args):
15
  super().__init__(args)
16
+ self.model = self.global_config.get("model", "gpt-3.5-turbo-instruct")
17
 
18
  def __call__(self, state: FactCheckerState, *args, **kwargs):
19
  claims = state.get(self.input_name)
src/openfactcheck/solvers/rarr_solvers/rarr_question_generator.py CHANGED
@@ -17,7 +17,7 @@ from .prompts import rarr_prompts
17
  class RARRQuestionGenerator(StandardTaskSolver):
18
  def __init__(self, args):
19
  super().__init__(args)
20
- self.model = self.global_config.get("model", "text-davinci-003")
21
  self.temperature_qgen = args.get("temperature_qgen", 0.7)
22
  self.num_rounds_qgen = args.get("num_rounds_qgen", 3)
23
 
 
17
  class RARRQuestionGenerator(StandardTaskSolver):
18
  def __init__(self, args):
19
  super().__init__(args)
20
+ self.model = self.global_config.get("model", "gpt-3.5-turbo-instruct")
21
  self.temperature_qgen = args.get("temperature_qgen", 0.7)
22
  self.num_rounds_qgen = args.get("num_rounds_qgen", 3)
23
 
src/openfactcheck/solvers/tutorial_solvers/utils/api.py CHANGED
@@ -34,7 +34,7 @@ def chatgpt(user_input):
34
 
35
  def davinci(prompt):
36
  # Set up the model and prompt
37
- model_engine = "text-davinci-003"
38
 
39
  # Generate a response
40
  completion = client.completions.create(
 
34
 
35
  def davinci(prompt):
36
  # Set up the model and prompt
37
+ model_engine = "gpt-3.5-turbo-instruct"
38
 
39
  # Generate a response
40
  completion = client.completions.create(
src/openfactcheck/solvers/webservice/rarr_rtv.py CHANGED
@@ -10,7 +10,7 @@ from .rarr_utils import search
10
  class RARRRetriever(StandardTaskSolver):
11
  def __init__(self, args):
12
  super().__init__(args)
13
- self.model = self.global_config.get("rarr_model", "text-davinci-003")
14
  self.temperature_qgen = args.get("temperature_qgen", 0.7)
15
  self.num_rounds_qgen = args.get("num_rounds_qgen", 3)
16
  self.max_search_results_per_query = args.get("max_search_results_per_query", 5)
 
10
  class RARRRetriever(StandardTaskSolver):
11
  def __init__(self, args):
12
  super().__init__(args)
13
+ self.model = self.global_config.get("rarr_model", "gpt-3.5-turbo-instruct")
14
  self.temperature_qgen = args.get("temperature_qgen", 0.7)
15
  self.num_rounds_qgen = args.get("num_rounds_qgen", 3)
16
  self.max_search_results_per_query = args.get("max_search_results_per_query", 5)
src/openfactcheck/solvers/webservice/rarr_vfr.py CHANGED
@@ -10,7 +10,7 @@ class RARRAgreementGate(StandardTaskSolver):
10
  def __init__(self, args):
11
  super().__init__(args)
12
  self.max_evidences_per_question = args.get("max_evidences_per_question", 1)
13
- self.model = self.global_config.get("rarr_model", "text-davinci-003")
14
 
15
  def __call__(self, state: FactCheckerState, *args, **kwargs):
16
  claims_with_evidences = state.get(self.input_name)
 
10
  def __init__(self, args):
11
  super().__init__(args)
12
  self.max_evidences_per_question = args.get("max_evidences_per_question", 1)
13
+ self.model = self.global_config.get("rarr_model", "gpt-3.5-turbo-instruct")
14
 
15
  def __call__(self, state: FactCheckerState, *args, **kwargs):
16
  claims_with_evidences = state.get(self.input_name)