SaylorTwift HF Staff commited on
Commit
ac706ba
·
verified ·
1 Parent(s): d11269c

Upload folder using huggingface_hub

Browse files
.eval-set-id ADDED
@@ -0,0 +1 @@
 
 
1
+ TwwWPS3WZyRpXhoRN2kcQ9
2025-12-16T12-28-27+00-00_hf-OpenEvals-SimpleQA_XFAsiMeTjZD9msWpDbxxHC.eval ADDED
Binary file (14.2 kB). View file
 
eval-set.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_set_id": "TwwWPS3WZyRpXhoRN2kcQ9",
3
+ "tasks": [
4
+ {
5
+ "name": "hf/OpenEvals/SimpleQA",
6
+ "task_id": "XFAsiMeTjZD9msWpDbxxHC",
7
+ "task_args": {},
8
+ "model": "hf-inference-providers/meta-llama/Llama-3.1-8B-Instruct",
9
+ "model_args": {},
10
+ "sequence": 0
11
+ }
12
+ ]
13
+ }
logs.json ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "2025-12-16T12-28-27+00-00_hf-OpenEvals-SimpleQA_XFAsiMeTjZD9msWpDbxxHC.eval": {
3
+ "version": 2,
4
+ "status": "success",
5
+ "eval": {
6
+ "eval_set_id": "TwwWPS3WZyRpXhoRN2kcQ9",
7
+ "eval_id": "XzoqBoXfcfEbgigJPVXSAc",
8
+ "run_id": "T5i3RtsGq7GzUYf4bhV9A2",
9
+ "created": "2025-12-16T12:28:27+00:00",
10
+ "task": "hf/OpenEvals/SimpleQA",
11
+ "task_id": "XFAsiMeTjZD9msWpDbxxHC",
12
+ "task_version": 0,
13
+ "task_display_name": "hf/OpenEvals/SimpleQA",
14
+ "task_attribs": {},
15
+ "task_args": {},
16
+ "task_args_passed": {},
17
+ "dataset": {
18
+ "name": "OpenEvals/SimpleQA",
19
+ "location": "OpenEvals/SimpleQA",
20
+ "samples": 4321,
21
+ "sample_ids": [
22
+ 1,
23
+ 2
24
+ ],
25
+ "shuffled": false
26
+ },
27
+ "model": "hf-inference-providers/meta-llama/Llama-3.1-8B-Instruct",
28
+ "model_generate_config": {},
29
+ "model_base_url": "https://router.huggingface.co/v1",
30
+ "model_args": {},
31
+ "model_roles": {},
32
+ "config": {
33
+ "limit": 2,
34
+ "epochs": 1,
35
+ "epochs_reducer": [
36
+ "mean"
37
+ ],
38
+ "fail_on_error": true,
39
+ "continue_on_fail": false,
40
+ "max_tasks": 4,
41
+ "sandbox_cleanup": true,
42
+ "log_samples": true,
43
+ "log_realtime": true,
44
+ "log_images": true,
45
+ "log_shared": 0,
46
+ "score_display": true
47
+ },
48
+ "revision": {
49
+ "type": "git",
50
+ "origin": "[email protected]:NathanHB/inspect_ai.git",
51
+ "commit": "92f0b800",
52
+ "dirty": true
53
+ },
54
+ "packages": {
55
+ "inspect_ai": "0.1.dev3944+g92f0b8008.d20251215"
56
+ },
57
+ "scorers": [
58
+ {
59
+ "name": "model_graded_fact",
60
+ "options": {},
61
+ "metrics": [
62
+ {
63
+ "name": "inspect_ai/accuracy",
64
+ "options": {}
65
+ },
66
+ {
67
+ "name": "inspect_ai/stderr",
68
+ "options": {}
69
+ }
70
+ ],
71
+ "metadata": {}
72
+ }
73
+ ]
74
+ },
75
+ "plan": {
76
+ "name": "plan",
77
+ "steps": [
78
+ {
79
+ "solver": "generate",
80
+ "params": {}
81
+ }
82
+ ],
83
+ "config": {}
84
+ },
85
+ "results": {
86
+ "total_samples": 2,
87
+ "completed_samples": 2,
88
+ "scores": [
89
+ {
90
+ "name": "model_graded_fact",
91
+ "scorer": "model_graded_fact",
92
+ "scored_samples": 2,
93
+ "unscored_samples": 0,
94
+ "params": {},
95
+ "metrics": {
96
+ "accuracy": {
97
+ "name": "accuracy",
98
+ "value": 0.0,
99
+ "params": {}
100
+ },
101
+ "stderr": {
102
+ "name": "stderr",
103
+ "value": 0.0,
104
+ "params": {}
105
+ }
106
+ }
107
+ }
108
+ ]
109
+ },
110
+ "stats": {
111
+ "started_at": "2025-12-16T12:28:27+00:00",
112
+ "completed_at": "2025-12-16T12:28:28+00:00",
113
+ "model_usage": {
114
+ "hf-inference-providers/meta-llama/Llama-3.1-8B-Instruct": {
115
+ "input_tokens": 687,
116
+ "output_tokens": 507,
117
+ "total_tokens": 1194,
118
+ "input_tokens_cache_read": 0
119
+ }
120
+ }
121
+ },
122
+ "invalidated": false,
123
+ "reductions": [
124
+ {
125
+ "scorer": "model_graded_fact",
126
+ "samples": [
127
+ {
128
+ "value": 0,
129
+ "answer": "I do not have information about who received the IEEE Frank Rosenblatt Award in 2010.",
130
+ "explanation": "To determine the grade, I will follow these steps:\n\n1. Extract the main content from both the expert answer and the submission. The expert answer contains: 'Michio Sugeno'. The submission states: 'I do not have information about who received the IEEE Frank Rosenblatt Award in 2010.'\n\n2. Compare the extracted content from the expert answer and the submission. The submission does not include any information about the recipient of the IEEE Frank Rosenblatt Award, nor does it mention Michio Sugeno by name.\n\n3. Assess whether the submission contains the content in the expert answer. Since the submission does not mention the IEEE Frank Rosenblatt Award recipient or Michio Sugeno, it does not contain the required information.\n\n4. Based on the comparison, since the submission does not match the content of the expert answer, it should be classified as an incorrect response.\n\n5. Determine the grade accordingly: Since the submitted answer is incorrect, it should be 'I'.\n\nGRADE: I",
131
+ "metadata": {
132
+ "grading": [
133
+ {
134
+ "id": "NBsiLqWKcmjWSL5MJbU8xZ",
135
+ "content": "\nYou are comparing a submitted answer to an expert answer on a given question. Here is the data:\n\n[BEGIN DATA]\n************\n[Question]: Who received the IEEE Frank Rosenblatt Award in 2010?\n************\n[Expert]: Michio Sugeno\n************\n[Submission]: I do not have information about who received the IEEE Frank Rosenblatt Award in 2010.\n************\n[END DATA]\n\nCompare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.\n\nDoes the submission contain the content in the expert answer?\n\n\nAfter assessing the submitted answer, reply with 'GRADE: $LETTER' (without quotes) where LETTER is one of CI. Please choose ONE option for the grade: either \"C\" for correct answers, or \"I\" for incorrect answers.\n\nFor example, after reviewing a correct answer you might write 'GRADE: C' or after reviewing an incorrect answer you might write 'GRADE: I'.\n\nFirst, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then, end with your answer formatted as 'GRADE: $LETTER' (without quotes) where LETTER is one of CI.\n\n",
136
+ "role": "user"
137
+ },
138
+ {
139
+ "id": "boAun88dWioKvWcMJUtALL",
140
+ "content": "To determine the grade, I will follow these steps:\n\n1. Extract the main content from both the expert answer and the submission. The expert answer contains: 'Michio Sugeno'. The submission states: 'I do not have information about who received the IEEE Frank Rosenblatt Award in 2010.'\n\n2. Compare the extracted content from the expert answer and the submission. The submission does not include any information about the recipient of the IEEE Frank Rosenblatt Award, nor does it mention Michio Sugeno by name.\n\n3. Assess whether the submission contains the content in the expert answer. Since the submission does not mention the IEEE Frank Rosenblatt Award recipient or Michio Sugeno, it does not contain the required information.\n\n4. Based on the comparison, since the submission does not match the content of the expert answer, it should be classified as an incorrect response.\n\n5. Determine the grade accordingly: Since the submitted answer is incorrect, it should be 'I'.\n\nGRADE: I",
141
+ "source": "generate",
142
+ "role": "assistant",
143
+ "model": "llama3.1-8b"
144
+ }
145
+ ]
146
+ },
147
+ "history": [],
148
+ "sample_id": 1
149
+ },
150
+ {
151
+ "value": 0,
152
+ "answer": "I am unable to verify who received the award in 2018.",
153
+ "explanation": "To compare the factual content of the submitted answer with the expert answer, I will follow these steps:\n\n1. Identify the key information from the expert answer, which is 'Annick Bricaud'. This is the specific data that needs to be present or absent in the submission.\n\n2. Evaluate the submission by looking for the phrase or the name 'Annick Bricaud'. \n\nIf the submission contains the name 'Annick Bricaud', it means that the submission has the same factual content as the expert answer.\n\nIf the submission does not contain the name 'Annick Bricaud' or mentions any other information, it means that the submission lacks the key information from the expert answer or provides incorrect information.\n\n3. Compare the provided submission with the information identified in step 1. According to the submission, it is unable to verify who received the award.\n\n4. Since the submission does not contain the name 'Annick Bricaud', it lacks the key information from the expert answer.\n\n5. Based on the comparison, the submission fails to meet the requirements for factuality, as it contains no actual answer and does not match the name given in the expert answer.\n\n6. The conclusion based on the assessment of the submitted answer shows that it contains incorrect information; hence it should be graded as incorrect or incomplete.\n\nGRADE: I",
154
+ "metadata": {
155
+ "grading": [
156
+ {
157
+ "id": "P5hziaKzQieyyXNxvPhvLC",
158
+ "content": "\nYou are comparing a submitted answer to an expert answer on a given question. Here is the data:\n\n[BEGIN DATA]\n************\n[Question]: Who was awarded the Oceanography Society's Jerlov Award in 2018?\n************\n[Expert]: Annick Bricaud\n************\n[Submission]: I am unable to verify who received the award in 2018.\n************\n[END DATA]\n\nCompare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation.\n\nDoes the submission contain the content in the expert answer?\n\n\nAfter assessing the submitted answer, reply with 'GRADE: $LETTER' (without quotes) where LETTER is one of CI. Please choose ONE option for the grade: either \"C\" for correct answers, or \"I\" for incorrect answers.\n\nFor example, after reviewing a correct answer you might write 'GRADE: C' or after reviewing an incorrect answer you might write 'GRADE: I'.\n\nFirst, write out in a step by step manner your reasoning about the criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then, end with your answer formatted as 'GRADE: $LETTER' (without quotes) where LETTER is one of CI.\n\n",
159
+ "role": "user"
160
+ },
161
+ {
162
+ "id": "iveGWXYKYCn4jgs77WhnF9",
163
+ "content": "To compare the factual content of the submitted answer with the expert answer, I will follow these steps:\n\n1. Identify the key information from the expert answer, which is 'Annick Bricaud'. This is the specific data that needs to be present or absent in the submission.\n\n2. Evaluate the submission by looking for the phrase or the name 'Annick Bricaud'. \n\nIf the submission contains the name 'Annick Bricaud', it means that the submission has the same factual content as the expert answer.\n\nIf the submission does not contain the name 'Annick Bricaud' or mentions any other information, it means that the submission lacks the key information from the expert answer or provides incorrect information.\n\n3. Compare the provided submission with the information identified in step 1. According to the submission, it is unable to verify who received the award.\n\n4. Since the submission does not contain the name 'Annick Bricaud', it lacks the key information from the expert answer.\n\n5. Based on the comparison, the submission fails to meet the requirements for factuality, as it contains no actual answer and does not match the name given in the expert answer.\n\n6. The conclusion based on the assessment of the submitted answer shows that it contains incorrect information; hence it should be graded as incorrect or incomplete.\n\nGRADE: I",
164
+ "source": "generate",
165
+ "role": "assistant",
166
+ "model": "llama3.1-8b"
167
+ }
168
+ ]
169
+ },
170
+ "history": [],
171
+ "sample_id": 2
172
+ }
173
+ ]
174
+ }
175
+ ]
176
+ }
177
+ }