
Commit 68b0365

Longbench group fix (#3359)

* make groups
* add longbench_e groups
* standardize scoring
* fix readme
* preserve original scoring method
* renaming scoring for better readability
* fix supergroup
* change alias to LongBench-E
* standardize
* fix typo
* increment version

---------

Co-authored-by: jannalulu <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>
1 parent 0563daa commit 68b0365

Note: large commits have some content hidden by default, so not every changed file appears below.
52 files changed (+484 / −114 lines)

lm_eval/evaluator_utils.py

Lines changed: 16 additions & 0 deletions

```diff
@@ -509,6 +509,22 @@ def consolidate_group_results(
         group_metadata = group_config.get("metadata", None)
         if group_metadata is not None:
             versions[group_or_task] = group_metadata.get("version", None)
+
+    # Clean up duplicate score rows for subtasks that also report other metrics.
+    for task in task_list:
+        task_metrics = [
+            key
+            for key in results[task].keys()
+            if "," in key and not key.startswith("score_stderr")
+        ]
+        score_metrics = [
+            key for key in task_metrics if key.startswith("score,")
+        ]
+        if score_metrics and len(task_metrics) > len(score_metrics):
+            for score_metric in score_metrics:
+                results[task].pop(score_metric, None)
+                stderr_key = score_metric.replace("score,", "score_stderr,")
+                results[task].pop(stderr_key, None)
     # print(results)
     return results, versions, show_group_table, task_aggregation_list
```
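To make the new cleanup concrete, here is a minimal standalone sketch of the same logic run against a hypothetical results dict (the task name and values are made up; the keys follow the harness's `metric,filter` naming convention):

```python
# Hypothetical per-task results, keyed "metric,filter" as in the harness output.
results = {
    "longbench_2wikimqa": {
        "alias": "longbench_2wikimqa",
        "score,none": 0.31,
        "score_stderr,none": 0.01,
        "qa_f1_score,none": 0.31,
        "qa_f1_score_stderr,none": 0.01,
    }
}

for task_results in results.values():
    # Metric rows contain a comma; "score_stderr" rows are paired up below.
    task_metrics = [
        key
        for key in task_results
        if "," in key and not key.startswith("score_stderr")
    ]
    score_metrics = [key for key in task_metrics if key.startswith("score,")]
    # Drop the generic "score" rows only when the task also reports a more
    # specific metric (here, qa_f1_score).
    if score_metrics and len(task_metrics) > len(score_metrics):
        for score_metric in score_metrics:
            task_results.pop(score_metric, None)
            task_results.pop(score_metric.replace("score,", "score_stderr,"), None)

print(sorted(results["longbench_2wikimqa"]))
# ['alias', 'qa_f1_score,none', 'qa_f1_score_stderr,none']
```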

Lines changed: 7 additions & 3 deletions

```diff
@@ -1,21 +1,25 @@
 tag:
-  - longbench
+  - longbench_multi_tasks
+  - longbench_tasks
 task: longbench_2wikimqa
 dataset_path: Xnhyacinth/LongBench
 test_split: test
 dataset_name: 2wikimqa
 doc_to_text: "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{question}}\nAnswer:"
 doc_to_target: '{{answers}}'
-process_results: !function metrics.get_qa_f1_score
+process_results: !function metrics.get_qa_f1_with_score
 generation_kwargs:
   max_gen_toks: 32
   temperature: 1
   do_sample: False
   until: []
 metric_list:
+  - metric: "score"
+    aggregation: mean
+    higher_is_better: True
   - metric: "qa_f1_score"
     aggregation: mean
     higher_is_better: True
 metadata:
-  version: 4.0
+  version: 5.0
```
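The config above swaps `metrics.get_qa_f1_score` for `metrics.get_qa_f1_with_score`, which, per the commit messages, preserves the original F1 scoring while also emitting a uniform `score` metric for the new groups to aggregate. The actual helper in `lm_eval/tasks/longbench/metrics.py` is not part of this excerpt; the sketch below is an assumed shape, with `_token_f1` as a simplified stand-in for the real QA F1 computation:

```python
from collections import Counter


def _token_f1(prediction: str, ground_truth: str) -> float:
    """Simplified token-level F1 between a prediction and one reference."""
    pred_tokens = prediction.split()
    gt_tokens = ground_truth.split()
    num_same = sum((Counter(pred_tokens) & Counter(gt_tokens)).values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)


def get_qa_f1_with_score(doc: dict, results: list) -> dict:
    """Assumed shape of the renamed process_results hook: report the
    task-specific metric plus a shared "score" key for group aggregation."""
    prediction = results[0]
    f1 = max(_token_f1(prediction, gt) for gt in doc["answers"])
    return {"qa_f1_score": f1, "score": f1}
```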
Lines changed: 7 additions & 3 deletions

```diff
@@ -1,21 +1,25 @@
 tag:
-  - longbench_e
+  - longbench_multi_tasks_e
+  - longbench_tasks_e
 task: longbench_2wikimqa_e
 dataset_path: Xnhyacinth/LongBench
 test_split: test
 dataset_name: 2wikimqa_e
 doc_to_text: "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{question}}\nAnswer:"
 doc_to_target: '{{answers}}'
-process_results: !function metrics.get_qa_f1_score
+process_results: !function metrics.get_qa_f1_with_score
 generation_kwargs:
   max_gen_toks: 32
   temperature: 1
   do_sample: False
   until: []
 metric_list:
+  - metric: "score"
+    aggregation: mean
+    higher_is_better: True
   - metric: "qa_f1_score"
     aggregation: mean
     higher_is_better: True
 metadata:
-  version: 4.0
+  version: 5.0
```

lm_eval/tasks/longbench/README.md

Lines changed: 16 additions & 11 deletions

````diff
@@ -26,23 +26,28 @@ Homepage: `https://github.com/THUDM/LongBench`
     pages = "3119--3137",
 }
 ```
-### Notes
-
-#### Tasks without Chat Template (with add_bos_token=True but model dependent)
-
-The original implementation suggest not to use `chat_template` for these tasks (for instruct models):
-- longbench_lcc
-- longbench_repobench-p
-- longbench_samsum
-- longbench_trec
-- longbench_triviaqa
+> [!NOTE]
+> The original implementation suggest not to use `chat_template` for these tasks for instruct models (with add_bos_token=True but model dependent):
+> - longbench_fewshot
+> - longbench_trec
+> - longbench_triviaqa
+> - longbench_samsum
+> - longbench_lsht
+> - longbench_code
+> - longbench_lcc
+> - longbench_repobench-p


 ### Groups, Tags, and Tasks

 #### Groups

-[//]: # (* `group_name`: `Short description`)
+* `longbench_single`: Single-Document QA tasks requiring comprehension of individual documents
+* `longbench_multi`: Multi-Document QA tasks requiring information synthesis across multiple documents
+* `longbench_summarization`: Summarization tasks for long documents and conversations
+* `longbench_fewshot`: Few-shot learning tasks with in-context examples
+* `longbench_synthetic`: Synthetic tasks including passage retrieval and counting
+* `longbench_code`: Code completion tasks for long code contexts

 #### Tags
````
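With the groups documented, a quick way to exercise the new configs is the harness's Python entry point; the model and `limit` below are placeholders for a smoke test, not a recommended setup:

```python
# Illustrative smoke test of the new LongBench groups.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",  # placeholder model
    tasks=["longbench_code", "longbench_fewshot"],
    limit=2,  # a couple of documents per subtask, just to exercise the configs
)
print(results["results"])
```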

lm_eval/tasks/longbench/_generate_config.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -211,7 +211,7 @@ def parse_args():
         "generation_kwargs": generation_kwargs,
         "has_newline": has_newline,  # Add the flag to the template context
         "metric_list": metric_list,
-        "metadata": {"version": "4.0"},
+        "metadata": {"version": "5.0"},
     }

     # Render template
```
Lines changed: 13 additions & 0 deletions (new file)

```yaml
group: longbench
task:
  - longbench_code
  - longbench_fewshot
  - longbench_multi
  - longbench_single
  - longbench_summarization
  - longbench_synthetic
aggregate_metric_list:
  - metric: score
    weight_by_size: False
metadata:
  version: 0.0
```
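In these group configs, `weight_by_size: False` makes the group score a plain mean of its subtasks' scores rather than a mean weighted by each subtask's sample count. A quick illustration with made-up numbers:

```python
# Made-up subtask scores and sizes to contrast the two aggregation modes.
subtasks = [
    {"score": 0.40, "size": 200},   # e.g. a small code subtask
    {"score": 0.25, "size": 4750},  # e.g. a large QA subtask
]

unweighted = sum(t["score"] for t in subtasks) / len(subtasks)
weighted = sum(t["score"] * t["size"] for t in subtasks) / sum(
    t["size"] for t in subtasks
)
print(f"weight_by_size: False -> {unweighted:.4f}")  # 0.3250
print(f"weight_by_size: True  -> {weighted:.4f}")    # 0.2561
```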
Lines changed: 10 additions & 0 deletions (new file)

```yaml
group: longbench_code
group_alias: "Code Completion"
task:
  - longbench_lcc
  - longbench_repobench-p
aggregate_metric_list:
  - metric: score
    weight_by_size: False
metadata:
  version: 0.0
```
Lines changed: 10 additions & 0 deletions (new file)

```yaml
group: longbench_code_e
group_alias: "Code Completion (LongBench-E)"
task:
  - longbench_lcc_e
  - longbench_repobench-p_e
aggregate_metric_list:
  - metric: score
    weight_by_size: False
metadata:
  version: 0.0
```
Lines changed: 11 additions & 0 deletions (new file)

```yaml
group: longbench_e
task:
  - longbench_code_e
  - longbench_fewshot_e
  - longbench_multi_e
  - longbench_single_e
aggregate_metric_list:
  - metric: score
    weight_by_size: False
metadata:
  version: 0.0
```
Lines changed: 12 additions & 0 deletions (new file)

```yaml
group: longbench_fewshot
group_alias: "Few-shot Learning"
task:
  - longbench_trec
  - longbench_triviaqa
  - longbench_samsum
  - longbench_lsht
aggregate_metric_list:
  - metric: score
    weight_by_size: False
metadata:
  version: 0.0
```
