TomTraining · moshesbeta · Apr 28, 2026
diff --git a/tables/Belief_R/DeepSeek-V3.2/config.json b/tables/Belief_R/DeepSeek-V3.2/config.json
@@ -0,0 +1,37 @@
+{
+  "dataset": "Belief_R",
+  "model": "deepseek-v3.2",
+  "prompt_method": "ZS_vanilla",
+  "repeats": 1,
+  "dataset_config": {
+    "dataset": "Belief_R",
+    "path": "Belief_R/test",
+    "method": "ZS_vanilla",
+    "schema": "MCQAnswer3Lower",
+    "system_prompt": "",
+    "use_llm_judge": null
+  },
+  "experiment_config": {
+    "llm_config": {
+      "model_name": "deepseek-v3.2",
+      "temperature": 0.6,
+      "max_tokens": 32768,
+      "max_workers": 32,
+      "enable_thinking": true,
+      "system_prompt": ""
+    },
+    "repeats": 1,
+    "max_samples": 0,
+    "datasets_path": "datasets",
+    "results_path": "results",
+    "judge_config": {
+      "model_name": "deepseek-v3.2",
+      "temperature": 0.0,
+      "max_tokens": 4096,
+      "system_prompt": "",
+      "enable_thinking": true,
+      "use_llm_judge": false
+    }
+  },
+  "exp_dir": "exp_20260427_105527"
+}
diff --git a/tables/Belief_R/GLM-5-Think/config.json b/tables/Belief_R/GLM-5-Think/config.json
@@ -0,0 +1,37 @@
+{
+  "dataset": "Belief_R",
+  "model": "glm-5",
+  "prompt_method": "ZS_vanilla",
+  "repeats": 1,
+  "dataset_config": {
+    "dataset": "Belief_R",
+    "path": "Belief_R/test",
+    "method": "ZS_vanilla",
+    "schema": "MCQAnswer3Lower",
+    "system_prompt": "",
+    "use_llm_judge": null
+  },
+  "experiment_config": {
+    "llm_config": {
+      "model_name": "glm-5",
+      "temperature": 0.6,
+      "max_tokens": 32768,
+      "max_workers": 16,
+      "enable_thinking": true,
+      "system_prompt": ""
+    },
+    "repeats": 1,
+    "max_samples": 0,
+    "datasets_path": "datasets",
+    "results_path": "results",
+    "judge_config": {
+      "model_name": "glm-5",
+      "temperature": 0.0,
+      "max_tokens": 4096,
+      "system_prompt": "",
+      "enable_thinking": true,
+      "use_llm_judge": false
+    }
+  },
+  "exp_dir": "exp_20260423_151546"
+}
diff --git a/tables/Belief_R/Gemini-3.1-Pro-Preview/config.json b/tables/Belief_R/Gemini-3.1-Pro-Preview/config.json
@@ -0,0 +1,37 @@
+{
+  "dataset": "Belief_R",
+  "model": "gemini-3.1-pro-preview",
+  "prompt_method": "ZS_vanilla",
+  "repeats": 1,
+  "dataset_config": {
+    "dataset": "Belief_R",
+    "path": "Belief_R/test",
+    "method": "ZS_vanilla",
+    "schema": "MCQAnswer3Lower",
+    "system_prompt": "",
+    "use_llm_judge": null
+  },
+  "experiment_config": {
+    "llm_config": {
+      "model_name": "gemini-3.1-pro-preview",
+      "temperature": 0.6,
+      "max_tokens": 32768,
+      "max_workers": 8,
+      "enable_thinking": true,
+      "system_prompt": ""
+    },
+    "repeats": 1,
+    "max_samples": 0,
+    "datasets_path": "datasets",
+    "results_path": "results",
+    "judge_config": {
+      "model_name": "deepseek-r1",
+      "temperature": 0.0,
+      "max_tokens": 4096,
+      "system_prompt": "",
+      "enable_thinking": true,
+      "use_llm_judge": false
+    }
+  },
+  "exp_dir": "exp_20260428_101316"
+}
diff --git a/tables/Belief_R/Qwen3.5-9B-NoThink/config.json b/tables/Belief_R/Qwen3.5-9B-NoThink/config.json
@@ -17,7 +17,7 @@
       "temperature": 0.6,
       "max_tokens": 32768,
       "max_workers": 64,
-      "enable_thinking": false,
+      "enable_thinking": true,
       "system_prompt": ""
     },
     "repeats": 3,
@@ -29,9 +29,9 @@
       "temperature": 0.0,
       "max_tokens": 4096,
       "system_prompt": "",
-      "enable_thinking": false,
+      "enable_thinking": true,
       "use_llm_judge": false
     }
   },
-  "exp_dir": "exp_20260422_145022"
+  "exp_dir": "exp_20260422_200733"
 }
diff --git a/tables/Belief_R/Qwen3.5-9B-Think/config.json b/tables/Belief_R/Qwen3.5-9B-Think/config.json
@@ -0,0 +1,37 @@
+{
+  "dataset": "Belief_R",
+  "model": "Qwen3.5-9B",
+  "prompt_method": "ZS_vanilla",
+  "repeats": 3,
+  "dataset_config": {
+    "dataset": "Belief_R",
+    "path": "Belief_R/test",
+    "method": "ZS_vanilla",
+    "schema": "MCQAnswer3Lower",
+    "system_prompt": "",
+    "use_llm_judge": null
+  },
+  "experiment_config": {
+    "llm_config": {
+      "model_name": "Qwen3.5-9B",
+      "temperature": 0.6,
+      "max_tokens": 32768,
+      "max_workers": 64,
+      "enable_thinking": true,
+      "system_prompt": ""
+    },
+    "repeats": 3,
+    "max_samples": 0,
+    "datasets_path": "datasets",
+    "results_path": "results",
+    "judge_config": {
+      "model_name": "Qwen3.5-9B",
+      "temperature": 0.0,
+      "max_tokens": 4096,
+      "system_prompt": "",
+      "enable_thinking": true,
+      "use_llm_judge": false
+    }
+  },
+  "exp_dir": "exp_20260422_200733"
+}
diff --git a/tables/Belief_R/Qwen3.5-Plus-Think/config.json b/tables/Belief_R/Qwen3.5-Plus-Think/config.json
@@ -0,0 +1,37 @@
+{
+  "dataset": "Belief_R",
+  "model": "qwen3.5-plus",
+  "prompt_method": "ZS_vanilla",
+  "repeats": 1,
+  "dataset_config": {
+    "dataset": "Belief_R",
+    "path": "Belief_R/test",
+    "method": "ZS_vanilla",
+    "schema": "MCQAnswer3Lower",
+    "system_prompt": "",
+    "use_llm_judge": null
+  },
+  "experiment_config": {
+    "llm_config": {
+      "model_name": "qwen3.5-plus",
+      "temperature": 0.6,
+      "max_tokens": 32768,
+      "max_workers": 32,
+      "enable_thinking": true,
+      "system_prompt": ""
+    },
+    "repeats": 1,
+    "max_samples": 0,
+    "datasets_path": "datasets",
+    "results_path": "results",
+    "judge_config": {
+      "model_name": "qwen3.5-plus",
+      "temperature": 0.0,
+      "max_tokens": 4096,
+      "system_prompt": "",
+      "enable_thinking": true,
+      "use_llm_judge": false
+    }
+  },
+  "exp_dir": "exp_20260427_122136"
+}
diff --git a/tables/Belief_R/其他指标.md b/tables/Belief_R/其他指标.md
@@ -2,12 +2,12 @@
 
 ## 标量指标
 
-| 指标 \ 模型 | DeepSeek-R1-Think | Qwen3-8B-Think | Qwen3.5-9B-NoThink |
-|---|---|---|---|
-| BM-Acc | 0.9487 | 0.9876 | 0.8213 |
-| BREU | 0.7083 | 0.6887 | 0.6850 |
-| BU-Acc | 0.4679 | 0.3897 | 0.5487 |
-| bm_correct | 1814.0000 | 1888.3333 | 1570.3333 |
-| bm_total | 1912.0000 | 1912.0000 | 1912.0000 |
-| bu_correct | 816.0000 | 679.6667 | 957.0000 |
-| bu_total | 1744.0000 | 1744.0000 | 1744.0000 |
+| 指标 \ 模型 | DeepSeek-R1-Think | Qwen3-8B-Think | Qwen3.5-9B-NoThink | Qwen3.5-9B-Think | DeepSeek-V3.2 | Qwen3.5-Plus-Think | GLM-5-Think | Gemini-3.1-Pro-Preview |
+|---|---|---|---|---|---|---|---|---|
+| BM-Acc | 0.9487 | 0.9876 | 0.8213 | 0.9916 | 0.9702 | 0.9922 | 0.9911 | 0.9953 |
+| BREU | 0.7083 | 0.6887 | 0.6850 | 0.6896 | 0.7021 | 0.6867 | 0.6885 | 0.6889 |
+| BU-Acc | 0.4679 | 0.3897 | 0.5487 | 0.3876 | 0.4341 | 0.3813 | 0.3859 | 0.3825 |
+| bm_correct | 1814.0000 | 1888.3333 | 1570.3333 | 1896.0000 | 1855.0000 | 1897.0000 | 1895.0000 | 1903.0000 |
+| bm_total | 1912.0000 | 1912.0000 | 1912.0000 | 1912.0000 | 1912.0000 | 1912.0000 | 1912.0000 | 1912.0000 |
+| bu_correct | 816.0000 | 679.6667 | 957.0000 | 676.0000 | 757.0000 | 665.0000 | 673.0000 | 667.0000 |
+| bu_total | 1744.0000 | 1744.0000 | 1744.0000 | 1744.0000 | 1744.0000 | 1744.0000 | 1744.0000 | 1744.0000 |
diff --git a/tables/Belief_R/基础指标.md b/tables/Belief_R/基础指标.md
@@ -1,7 +1,7 @@
 # Belief_R - 基础指标
 
-| 指标 \ 模型 | DeepSeek-R1-Think | Qwen3-8B-Think | Qwen3.5-9B-NoThink |
-|---|---|---|---|
-| accuracy | 0.7194 | 0.7024 | 0.6913 |
-| correct | 2630.0000 | 2568.0000 | 2527.3333 |
-| total | 3656.0000 | 3656.0000 | 3656.0000 |
+| 指标 \ 模型 | DeepSeek-R1-Think | Qwen3-8B-Think | Qwen3.5-9B-NoThink | Qwen3.5-9B-Think | DeepSeek-V3.2 | Qwen3.5-Plus-Think | GLM-5-Think | Gemini-3.1-Pro-Preview |
+|---|---|---|---|---|---|---|---|---|
+| accuracy | 0.7194 | 0.7024 | 0.6913 | 0.7035 | 0.7144 | 0.7008 | 0.7024 | 0.7030 |
+| correct | 2630.0000 | 2568.0000 | 2527.3333 | 2572.0000 | 2612.0000 | 2562.0000 | 2568.0000 | 2570.0000 |
+| total | 3656.0000 | 3656.0000 | 3656.0000 | 3656.0000 | 3656.0000 | 3656.0000 | 3656.0000 | 3656.0000 |
diff --git a/tables/FANToM/DeepSeek-R1-Think/config.json b/tables/FANToM/DeepSeek-R1-Think/config.json
@@ -0,0 +1,41 @@
+{
+  "dataset": "FANToM",
+  "model": "deepseek-r1",
+  "prompt_method": "ZS_vanilla",
+  "repeats": 1,
+  "dataset_config": {
+    "dataset": "FANToM",
+    "path": "FanToM/test",
+    "method": "ZS_vanilla",
+    "schema": [
+      "OpenAnswer",
+      "MCQAnswer2",
+      "MultiLabelAnswer"
+    ],
+    "system_prompt": "",
+    "use_llm_judge": true
+  },
+  "experiment_config": {
+    "llm_config": {
+      "model_name": "deepseek-r1",
+      "temperature": 0.6,
+      "max_tokens": 32768,
+      "max_workers": 16,
+      "enable_thinking": true,
+      "system_prompt": ""
+    },
+    "repeats": 1,
+    "max_samples": 0,
+    "datasets_path": "datasets",
+    "results_path": "results",
+    "judge_config": {
+      "model_name": "deepseek-r1",
+      "temperature": 0.0,
+      "max_tokens": 4096,
+      "system_prompt": "",
+      "enable_thinking": true,
+      "use_llm_judge": false
+    }
+  },
+  "exp_dir": "exp_20260428_091549"
+}
diff --git a/tables/FANToM/DeepSeek-V3.2/config.json b/tables/FANToM/DeepSeek-V3.2/config.json
@@ -0,0 +1,41 @@
+{
+  "dataset": "FANToM",
+  "model": "deepseek-v3.2",
+  "prompt_method": "ZS_vanilla",
+  "repeats": 1,
+  "dataset_config": {
+    "dataset": "FANToM",
+    "path": "FanToM/test",
+    "method": "ZS_vanilla",
+    "schema": [
+      "OpenAnswer",
+      "MCQAnswer2",
+      "MultiLabelAnswer"
+    ],
+    "system_prompt": "",
+    "use_llm_judge": true
+  },
+  "experiment_config": {
+    "llm_config": {
+      "model_name": "deepseek-v3.2",
+      "temperature": 0.6,
+      "max_tokens": 32768,
+      "max_workers": 32,
+      "enable_thinking": true,
+      "system_prompt": ""
+    },
+    "repeats": 1,
+    "max_samples": 0,
+    "datasets_path": "datasets",
+    "results_path": "results",
+    "judge_config": {
+      "model_name": "deepseek-v3.2",
+      "temperature": 0.0,
+      "max_tokens": 4096,
+      "system_prompt": "",
+      "enable_thinking": true,
+      "use_llm_judge": false
+    }
+  },
+  "exp_dir": "exp_20260427_105527"
+}
diff --git a/tables/FANToM/其他指标.md b/tables/FANToM/其他指标.md
@@ -0,0 +1,22 @@
+# FANToM - 其他指标
+
+## by_category
+
+| 子指标 \ 模型 | DeepSeek-V3.2 | DeepSeek-R1-Think |
+|---|---|---|
+| answerability.ALL | 0.1437 | 0.1276 |
+| answerability.list.accuracy | 0.4310 | 0.4874 |
+| answerability.yn.accuracy | 0.6376 | 0.6040 |
+| answerability.yn.weighted_f1 | 0.6744 | 0.6037 |
+| belief.choice.accuracy | 0.6071 | 0.6390 |
+| belief.qa.accuracy | 0.3688 | 0.3701 |
+| belief.qa.token_f1 | 0.3618 | 0.3044 |
+| belief.qa.token_f1_when_correct | 0.4073 | 0.3559 |
+| fact.qa.accuracy | 0.8207 | 0.8908 |
+| fact.token_f1 | 0.5134 | 0.4833 |
+| infoaccess.ALL | 0.3448 | 0.3943 |
+| infoaccess.list.accuracy | 0.5678 | 0.5770 |
+| infoaccess.yn.accuracy | 0.7953 | 0.8572 |
+| infoaccess.yn.weighted_f1 | 0.8341 | 0.8590 |
+| overall.ALL | 0.0529 | 0.0437 |
+| overall.ALL_star | 0.0138 | 0.0126 |
diff --git a/tables/FANToM/基础指标.md b/tables/FANToM/基础指标.md
@@ -0,0 +1,7 @@
+# FANToM - 基础指标
+
+| 指标 \ 模型 | DeepSeek-V3.2 | DeepSeek-R1-Think |
+|---|---|---|
+| accuracy | 0.6393 | 0.6603 |
+| correct | 8203.0000 | 8473.0000 |
+| total | 12832.0000 | 12832.0000 |