# OS-World 75fd8c0
import os
import argparse
def _read_result_score(result_path):
    """Read one ``result.txt`` file and return its score as a float.

    The content is first parsed as a number. If that fails it is parsed as a
    Python literal, so files containing ``True`` / ``False`` map to 1.0 / 0.0.
    A file that cannot be read or parsed is counted as 0.0 and a warning is
    written to stderr, so one broken result does not abort the whole report.

    Args:
        result_path (str): Path to the ``result.txt`` file.

    Returns:
        float: The parsed score, or 0.0 when the file is unreadable or its
        content is not a number / boolean literal.
    """
    import ast
    import sys

    try:
        with open(result_path, "r") as f:
            text = f.read().strip()
    except (OSError, UnicodeDecodeError) as err:
        print(f"Warning: could not read {result_path} ({err}); counting it as 0.0",
              file=sys.stderr)
        return 0.0

    try:
        return float(text)
    except ValueError:
        pass

    # NOTE: this used to be eval(); literal_eval accepts the same literal
    # contents (numbers, True/False) without executing arbitrary expressions
    # found in a file on disk.
    try:
        return float(ast.literal_eval(text))
    except (ValueError, SyntaxError, TypeError):
        print(f"Warning: could not parse {result_path} (content {text!r}); counting it as 0.0",
              file=sys.stderr)
        return 0.0


def get_result(action_space, use_model, observation_type, result_dir, show_detailed_scores=False):
    """
    Calculate and display evaluation results from OSWorld benchmark runs.

    Walks ``result_dir/action_space/observation_type/use_model/<domain>/<example_id>``
    and collects the score stored in every ``result.txt`` found. Each file is
    read and parsed exactly once, and the same score feeds the per-domain
    statistics, the per-example summary and the overall success rate.
    Domains and examples are visited in sorted order so the output is
    deterministic. The per-example summary is written as JSON to
    ``all_result.json`` inside the target directory.

    Args:
        action_space (str): Action space used (e.g., "pyautogui", "computer_13")
        use_model (str): Model name used for evaluation (e.g., "gpt-4o", "claude-3")
        observation_type (str): Observation type used (e.g., "screenshot", "a11y_tree")
        result_dir (str): Root directory containing results
        show_detailed_scores (bool): If True, show detailed scores per domain in format "score/total"

    Returns:
        list: List of all individual task results, or None if no results found
    """
    import json

    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
    # isdir (rather than exists) also rejects a plain file at this path,
    # which os.listdir below could not handle.
    if not os.path.isdir(target_dir):
        print("New experiment, no result yet.")
        return None

    all_result = []
    domain_result = {}
    all_result_for_analysis = {}

    for domain in sorted(os.listdir(target_dir)):
        domain_path = os.path.join(target_dir, domain)
        if not os.path.isdir(domain_path):
            continue
        for example_id in sorted(os.listdir(domain_path)):
            result_path = os.path.join(domain_path, example_id, "result.txt")
            if not os.path.isfile(result_path):
                continue
            # One parse per file keeps the three accumulators consistent.
            score = _read_result_score(result_path)
            domain_result.setdefault(domain, []).append(score)
            all_result_for_analysis.setdefault(domain, {})[example_id] = score
            all_result.append(score)

    if show_detailed_scores:
        # Print detailed scores in format "score/total" for each domain
        result_order = ["chrome", "gimp", "libreoffice_calc", "libreoffice_impress",
                        "libreoffice_writer", "multi_apps", "os", "thunderbird", "vlc", "vs_code"]
        output_row = []
        for d in result_order:
            if d in domain_result:
                output_row.append(f"{round(sum(domain_result[d]),2)}/{len(domain_result[d])}")
            else:
                output_row.append("0.00/0")
        print(" ".join(output_row))
    else:
        # Print standard per-domain statistics
        for domain, scores in domain_result.items():
            print("Domain:", domain, "Runned:", len(scores), "Success Rate:",
                  sum(scores) / len(scores) * 100, "%")
        print(">>>>>>>>>>>>>")

        # Print category-level statistics; a category is reported only when
        # every one of its member domains has at least one result.
        categories = (
            ("Office", ("libreoffice_calc", "libreoffice_impress", "libreoffice_writer")),
            ("Daily", ("vlc", "thunderbird", "chrome")),
            ("Professional", ("gimp", "vs_code")),
        )
        for label, members in categories:
            if all(d in domain_result for d in members):
                pooled = [s for d in members for s in domain_result[d]]
                print(label, "Success Rate:", sum(pooled) / len(pooled) * 100, "%")

    # Written as real JSON so the ".json" file can be loaded with json.load.
    with open(os.path.join(target_dir, "all_result.json"), "w") as f:
        json.dump(all_result_for_analysis, f)

    if not all_result:
        print("New experiment, no result yet.")
        return None

    print("Runned:", len(all_result), "Current Success Rate:",
          round(sum(all_result) / len(all_result) * 100, 2), "%",
          f"{round(sum(all_result), 2)}", "/", str(len(all_result)))
    return all_result
if __name__ == "__main__":
    # Command-line entry point: parse the run identifiers and print the report.
    cli = argparse.ArgumentParser(
        description="Calculate and display OSWorld evaluation results"
    )

    # String-valued options as (flag, default, help), registered in this order.
    string_options = (
        ("--action_space", "pyautogui",
         "Action space used (e.g., 'pyautogui', 'computer_13')"),
        ("--model", "gpt-4o",
         "Model name used for evaluation (e.g., 'gpt-4o', 'claude-3')"),
        ("--observation_type", "screenshot",
         "Observation type used (e.g., 'screenshot', 'a11y_tree', 'som')"),
        ("--result_dir", "./results",
         "Root directory containing results (default: ./results)"),
    )
    for flag, default_value, help_text in string_options:
        cli.add_argument(flag, type=str, default=default_value, help=help_text)

    # Boolean switch selecting the compact "score/total" per-domain output.
    cli.add_argument(
        "--detailed",
        action="store_true",
        help="Show detailed scores per domain in format 'score/total'"
    )

    opts = cli.parse_args()
    get_result(
        opts.action_space,
        opts.model,
        opts.observation_type,
        opts.result_dir,
        opts.detailed
    )