[ { "Model": "o1", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 94.4, "Condition": "True" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 89.2, "Condition": "True" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 95.8, "Condition": "True" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 90.6, "Condition": "True" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 86.2, "Condition": "True" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 87.0, "Condition": "True" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 93.4, "Condition": "True" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 88.0, "Condition": "True" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 94.6, "Condition": "True" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 85.0, "Condition": "True" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 97.8, "Condition": "True" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 90.6, "Condition": "True" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 89.8, "Condition": "True" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 85.0, "Condition": "True" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 78.2, "Condition": "True" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 88.4, "Condition": "True" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 82.4, "Condition": "True" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 83.6, "Condition": "True" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 65.2, "Condition": "True" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 91.4, "Condition": "True" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 86.0, "Condition": "True" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 90.8, "Condition": "True" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 85.8, "Condition": "True" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 85.8, "Condition": "True" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 98.2, "Condition": "False" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 96.2, "Condition": "False" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 91.4, "Condition": "False" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 97.6, "Condition": "False" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 96.8, "Condition": "False" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 91.0, "Condition": "False" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 81.2, "Condition": "False" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 97.0, "Condition": "False" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 88.0, "Condition": "False" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 89.6, "Condition": "False" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 80.0, "Condition": "False" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 83.0, "Condition": "False" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 49.4, "Condition": "False" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 94.4, "Condition": "False" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 87.6, "Condition": "False" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 69.4, "Condition": "False" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 78.6, "Condition": "False" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 60.0, "Condition": "False" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 51.6, "Condition": "False" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 79.8, "Condition": "False" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 65.6, "Condition": "False" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 80.0, "Condition": "False" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 65.8, "Condition": "False" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Direct Fact Ver.", "Score": 64.8, "Condition": "False" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 96.2, "Condition": "True" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 94.6, "Condition": "True" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 97.4, "Condition": "True" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 94.4, "Condition": "True" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 93.0, "Condition": "True" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 98.0, "Condition": "True" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 97.2, "Condition": "True" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 91.4, "Condition": "True" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 95.4, "Condition": "True" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 86.6, "Condition": "True" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 97.2, "Condition": "True" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 91.4, "Condition": "True" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 95.0, "Condition": "True" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 91.6, "Condition": "True" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 90.2, "Condition": "True" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 95.8, "Condition": "True" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 89.0, "Condition": "True" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 87.6, "Condition": "True" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 89.8, "Condition": "True" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 91.0, "Condition": "True" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 89.2, "Condition": "True" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 90.0, "Condition": "True" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 89.0, "Condition": "True" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Ver. of Assertion", "Score": 88.4, "Condition": "True" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 95.4, "Condition": "True" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 93.8, "Condition": "True" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 97.4, "Condition": "True" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 96.8, "Condition": "True" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 97.8, "Condition": "True" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 98.8, "Condition": "True" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 98.0, "Condition": "True" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 90.0, "Condition": "True" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 95.4, "Condition": "True" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 87.2, "Condition": "True" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 96.6, "Condition": "True" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 94.4, "Condition": "True" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 95.4, "Condition": "True" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 94.0, "Condition": "True" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 92.2, "Condition": "True" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 95.4, "Condition": "True" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 92.8, "Condition": "True" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 92.4, "Condition": "True" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 93.8, "Condition": "True" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 89.6, "Condition": "True" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 86.0, "Condition": "True" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 89.0, "Condition": "True" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 85.8, "Condition": "True" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Knowledge", "Score": 85.6, "Condition": "True" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 93.8, "Condition": "True" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 85.8, "Condition": "True" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 94.0, "Condition": "True" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 86.4, "Condition": "True" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 83.8, "Condition": "True" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 89.2, "Condition": "True" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 90.6, "Condition": "True" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 88.6, "Condition": "True" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 93.6, "Condition": "True" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 86.4, "Condition": "True" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 95.0, "Condition": "True" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 90.2, "Condition": "True" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 89.8, "Condition": "True" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 80.2, "Condition": "True" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 74.8, "Condition": "True" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 84.8, "Condition": "True" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 81.4, "Condition": "True" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 81.6, "Condition": "True" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 83.8, "Condition": "True" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 85.6, "Condition": "True" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 79.8, "Condition": "True" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 85.0, "Condition": "True" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 80.8, "Condition": "True" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 80.4, "Condition": "True" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 97.6, "Condition": "False" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 96.2, "Condition": "False" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 93.4, "Condition": "False" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 98.2, "Condition": "False" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 97.0, "Condition": "False" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 88.2, "Condition": "False" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 82.4, "Condition": "False" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 96.6, "Condition": "False" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 92.0, "Condition": "False" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 89.4, "Condition": "False" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 88.2, "Condition": "False" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 88.2, "Condition": "False" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 62.2, "Condition": "False" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 94.4, "Condition": "False" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 87.4, "Condition": "False" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 69.2, "Condition": "False" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 80.8, "Condition": "False" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 62.4, "Condition": "False" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 30.4, "Condition": "False" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 87.4, "Condition": "False" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 75.4, "Condition": "False" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 87.4, "Condition": "False" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 72.8, "Condition": "False" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Ver. of 1P Belief", "Score": 72.8, "Condition": "False" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 99.6, "Condition": "True" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 98.0, "Condition": "True" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 98.2, "Condition": "True" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 98.6, "Condition": "True" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 99.0, "Condition": "True" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 99.6, "Condition": "True" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 99.8, "Condition": "True" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 90.4, "Condition": "True" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 96.2, "Condition": "True" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 86.8, "Condition": "True" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 100.0, "Condition": "True" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 93.4, "Condition": "True" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 94.8, "Condition": "True" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 89.0, "Condition": "True" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 94.0, "Condition": "True" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 93.4, "Condition": "True" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 84.2, "Condition": "True" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 89.4, "Condition": "True" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 82.2, "Condition": "True" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 96.0, "Condition": "True" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 91.0, "Condition": "True" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 95.4, "Condition": "True" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 90.2, "Condition": "True" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 91.2, "Condition": "True" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 83.8, "Condition": "False" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 66.6, "Condition": "False" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 64.4, "Condition": "False" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 67.8, "Condition": "False" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 69.0, "Condition": "False" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 87.6, "Condition": "False" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 92.4, "Condition": "False" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 14.4, "Condition": "False" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 29.6, "Condition": "False" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 18.4, "Condition": "False" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 94.2, "Condition": "False" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 22.0, "Condition": "False" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 51.0, "Condition": "False" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 45.6, "Condition": "False" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 54.8, "Condition": "False" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 50.0, "Condition": "False" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 18.8, "Condition": "False" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 44.8, "Condition": "False" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 66.8, "Condition": "False" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 83.2, "Condition": "False" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 55.6, "Condition": "False" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 77.2, "Condition": "False" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 57.0, "Condition": "False" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Conf. of 1P Belief", "Score": 55.8, "Condition": "False" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 97.2, "Condition": "True" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 93.8, "Condition": "True" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 98.4, "Condition": "True" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 96.8, "Condition": "True" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 95.0, "Condition": "True" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 97.8, "Condition": "True" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 99.0, "Condition": "True" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 89.2, "Condition": "True" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 92.8, "Condition": "True" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 85.4, "Condition": "True" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 98.6, "Condition": "True" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 93.0, "Condition": "True" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 93.2, "Condition": "True" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 96.2, "Condition": "True" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 93.8, "Condition": "True" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 86.0, "Condition": "True" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 81.6, "Condition": "True" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 83.6, "Condition": "True" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 75.4, "Condition": "True" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 93.6, "Condition": "True" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 81.6, "Condition": "True" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 91.8, "Condition": "True" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 82.2, "Condition": "True" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 83.2, "Condition": "True" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 27.4, "Condition": "False" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 50.6, "Condition": "False" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 57.2, "Condition": "False" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 39.2, "Condition": "False" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 50.0, "Condition": "False" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 63.0, "Condition": "False" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 84.6, "Condition": "False" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 18.2, "Condition": "False" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 16.2, "Condition": "False" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 19.0, "Condition": "False" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 63.6, "Condition": "False" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 17.6, "Condition": "False" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 46.2, "Condition": "False" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 55.8, "Condition": "False" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 46.8, "Condition": "False" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 34.2, "Condition": "False" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 19.2, "Condition": "False" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 44.6, "Condition": "False" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 58.4, "Condition": "False" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 58.2, "Condition": "False" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 41.2, "Condition": "False" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 56.2, "Condition": "False" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 41.6, "Condition": "False" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Intrsp. of 1P Belief", "Score": 43.0, "Condition": "False" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 100.0, "Condition": "True" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 100.0, "Condition": "True" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 99.0, "Condition": "True" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 99.8, "Condition": "True" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 99.8, "Condition": "True" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 100.0, "Condition": "True" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 100.0, "Condition": "True" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 99.2, "Condition": "True" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 99.6, "Condition": "True" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 97.8, "Condition": "True" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 100.0, "Condition": "True" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 98.4, "Condition": "True" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 95.6, "Condition": "True" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 96.6, "Condition": "True" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 97.2, "Condition": "True" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 97.6, "Condition": "True" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 98.0, "Condition": "True" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 87.2, "Condition": "True" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 92.4, "Condition": "True" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 96.2, "Condition": "True" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 93.2, "Condition": "True" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 96.2, "Condition": "True" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 93.6, "Condition": "True" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 93.6, "Condition": "True" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 99.2, "Condition": "False" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 99.6, "Condition": "False" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 87.4, "Condition": "False" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 98.4, "Condition": "False" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 97.2, "Condition": "False" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 99.0, "Condition": "False" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 94.6, "Condition": "False" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 94.2, "Condition": "False" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 96.4, "Condition": "False" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 79.6, "Condition": "False" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 99.6, "Condition": "False" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 74.0, "Condition": "False" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 62.4, "Condition": "False" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 87.2, "Condition": "False" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 86.0, "Condition": "False" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 76.2, "Condition": "False" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 83.6, "Condition": "False" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 55.0, "Condition": "False" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 84.6, "Condition": "False" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 88.6, "Condition": "False" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 79.6, "Condition": "False" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 87.6, "Condition": "False" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 79.6, "Condition": "False" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (J)", "Score": 79.8, "Condition": "False" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 100.0, "Condition": "True" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 100.0, "Condition": "True" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 98.8, "Condition": "True" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 99.8, "Condition": "True" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 100.0, "Condition": "True" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 100.0, "Condition": "True" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 99.6, "Condition": "True" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 98.2, "Condition": "True" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 99.8, "Condition": "True" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 99.2, "Condition": "True" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 100.0, "Condition": "True" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 98.4, "Condition": "True" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 95.0, "Condition": "True" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 96.6, "Condition": "True" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 97.4, "Condition": "True" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 97.0, "Condition": "True" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 97.8, "Condition": "True" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 87.8, "Condition": "True" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 87.4, "Condition": "True" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 96.6, "Condition": "True" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 93.4, "Condition": "True" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 96.0, "Condition": "True" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 93.6, "Condition": "True" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 93.6, "Condition": "True" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 98.2, "Condition": "False" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 99.4, "Condition": "False" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 87.0, "Condition": "False" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 99.2, "Condition": "False" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 97.8, "Condition": "False" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 99.6, "Condition": "False" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 94.2, "Condition": "False" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 90.6, "Condition": "False" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 98.0, "Condition": "False" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 86.6, "Condition": "False" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 99.8, "Condition": "False" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 77.6, "Condition": "False" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 63.6, "Condition": "False" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 89.4, "Condition": "False" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 88.0, "Condition": "False" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 75.4, "Condition": "False" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 86.2, "Condition": "False" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 55.8, "Condition": "False" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 76.6, "Condition": "False" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 90.2, "Condition": "False" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 79.0, "Condition": "False" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 89.4, "Condition": "False" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 79.0, "Condition": "False" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Conf. of 3P Belief (M)", "Score": 79.0, "Condition": "False" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 99.8, "Condition": "True" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "True" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 99.2, "Condition": "True" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "True" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "True" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "True" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "True" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "True" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "True" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "True" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "True" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 99.0, "Condition": "True" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 95.2, "Condition": "True" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 96.6, "Condition": "True" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 97.8, "Condition": "True" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 98.8, "Condition": "True" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 98.4, "Condition": "True" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 97.0, "Condition": "True" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 90.8, "Condition": "True" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 96.6, "Condition": "True" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 96.2, "Condition": "True" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 96.0, "Condition": "True" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 96.0, "Condition": "True" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 96.0, "Condition": "True" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "False" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "False" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 92.6, "Condition": "False" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "False" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "False" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "False" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "False" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "False" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "False" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 99.8, "Condition": "False" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 100.0, "Condition": "False" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 94.6, "Condition": "False" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 79.2, "Condition": "False" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 91.4, "Condition": "False" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 92.8, "Condition": "False" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 93.0, "Condition": "False" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 95.6, "Condition": "False" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 91.8, "Condition": "False" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 84.8, "Condition": "False" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 93.6, "Condition": "False" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 93.6, "Condition": "False" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 92.8, "Condition": "False" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 92.8, "Condition": "False" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (JM)", "Score": 93.0, "Condition": "False" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "True" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "True" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 99.4, "Condition": "True" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "True" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "True" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "True" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "True" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "True" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "True" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 99.8, "Condition": "True" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "True" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 98.6, "Condition": "True" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 96.6, "Condition": "True" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 97.0, "Condition": "True" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 97.8, "Condition": "True" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 98.0, "Condition": "True" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 98.0, "Condition": "True" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 95.6, "Condition": "True" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 35.0, "Condition": "True" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 96.6, "Condition": "True" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 95.0, "Condition": "True" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 96.0, "Condition": "True" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 94.8, "Condition": "True" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 95.2, "Condition": "True" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "False" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "False" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 93.4, "Condition": "False" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "False" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "False" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "False" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "False" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "False" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 99.8, "Condition": "False" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 99.6, "Condition": "False" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 100.0, "Condition": "False" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 94.0, "Condition": "False" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 84.8, "Condition": "False" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 91.4, "Condition": "False" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 93.0, "Condition": "False" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 93.0, "Condition": "False" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 95.0, "Condition": "False" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 90.4, "Condition": "False" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 26.2, "Condition": "False" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 93.6, "Condition": "False" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 88.8, "Condition": "False" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 92.8, "Condition": "False" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 88.6, "Condition": "False" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Corr. Attrib. of Belief (MJ)", "Score": 88.6, "Condition": "False" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 96.0, "Condition": "True" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 93.6, "Condition": "True" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 95.0, "Condition": "True" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 34.2, "Condition": "True" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 35.8, "Condition": "True" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 97.2, "Condition": "True" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 91.8, "Condition": "True" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 90.6, "Condition": "True" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 94.2, "Condition": "True" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 81.6, "Condition": "True" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 96.0, "Condition": "True" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 88.4, "Condition": "True" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 94.8, "Condition": "True" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 66.4, "Condition": "True" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 30.6, "Condition": "True" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 87.0, "Condition": "True" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 93.2, "Condition": "True" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 90.8, "Condition": "True" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 89.2, "Condition": "True" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 81.8, "Condition": "True" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 82.8, "Condition": "True" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 79.4, "Condition": "True" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 81.2, "Condition": "True" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Ver. of Rec. Knowledge", "Score": 80.2, "Condition": "True" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 100.0, "Condition": "True" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 100.0, "Condition": "True" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 99.4, "Condition": "True" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 100.0, "Condition": "True" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 99.4, "Condition": "True" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 100.0, "Condition": "True" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 100.0, "Condition": "True" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 100.0, "Condition": "True" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 100.0, "Condition": "True" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 100.0, "Condition": "True" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 100.0, "Condition": "True" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 98.6, "Condition": "True" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 90.6, "Condition": "True" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 96.6, "Condition": "True" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 78.8, "Condition": "True" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 95.0, "Condition": "True" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 97.6, "Condition": "True" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 83.0, "Condition": "True" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 62.2, "Condition": "True" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 96.4, "Condition": "True" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 69.6, "Condition": "True" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 96.2, "Condition": "True" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 68.6, "Condition": "True" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Conf. of Rec. Knowledge", "Score": 68.6, "Condition": "True" }, { "Model": "o1", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 99.8, "Condition": "True" }, { "Model": "o3-mini", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 98.4, "Condition": "True" }, { "Model": "4o", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 99.6, "Condition": "True" }, { "Model": "3.7 Sonnet", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 96.0, "Condition": "True" }, { "Model": "3.5 Sonnet", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 100.0, "Condition": "True" }, { "Model": "Gemini 2 Flash", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 100.0, "Condition": "True" }, { "Model": "Gemini 2 Flash-Lite", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 100.0, "Condition": "True" }, { "Model": "R1", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 83.2, "Condition": "True" }, { "Model": "R1-Dist.-Llama-70B", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 93.6, "Condition": "True" }, { "Model": "R1-Dist.Qwen-14B", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 88.2, "Condition": "True" }, { "Model": "Llama 3.3-70B Inst. Turbo", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 97.0, "Condition": "True" }, { "Model": "GPT-4", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 98.6, "Condition": "True" }, { "Model": "GPT-3.5", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 65.6, "Condition": "True" }, { "Model": "3 Opus", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 97.2, "Condition": "True" }, { "Model": "3 Sonnet", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 98.4, "Condition": "True" }, { "Model": "3 Haiku", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 74.6, "Condition": "True" }, { "Model": "Mixtral 8x22B", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 94.4, "Condition": "True" }, { "Model": "Mixtral 8x7B", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 88.2, "Condition": "True" }, { "Model": "Mixtral 7B", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 96.0, "Condition": "True" }, { "Model": "Llama-3 70B", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 96.8, "Condition": "True" }, { "Model": "Llama-3 8B", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 80.4, "Condition": "True" }, { "Model": "Llama-2 70B", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 96.8, "Condition": "True" }, { "Model": "Llama-2 13B", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 79.0, "Condition": "True" }, { "Model": "Llama-2 7B", "Type": "Large Language Model", "Dataset": "Awrn. of Rec. Knowledge", "Score": 80.0, "Condition": "True" } ]