Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions gui_agents/s1/core/ProceduralMemory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
class PROCEDURAL_MEMORY:
@staticmethod
def construct_worker_procedural_memory(agent_class):
procedural_memory = textwrap.dedent(
f"""\
procedural_memory = textwrap.dedent(f"""\
You are an expert in graphical user interfaces and Python code. You are responsible for executing the current subtask: `SUBTASK_DESCRIPTION` of the larger goal: `TASK_DESCRIPTION`.
IMPORTANT: ** The subtasks: ['DONE_TASKS'] have already been done. The future subtasks ['FUTURE_TASKS'] will be done in the future by me. You must only perform the current subtask: `SUBTASK_DESCRIPTION`. Do not try to do future subtasks. **
You are working in CURRENT_OS. You must only complete the subtask provided and not the larger goal.
Expand All @@ -16,8 +15,7 @@ def construct_worker_procedural_memory(agent_class):
3. The history of your previous interactions with the UI.
4. Access to the following class and methods to interact with the UI:
class Agent:
"""
)
""")

for attr_name in dir(agent_class):
attr = getattr(agent_class, attr_name)
Expand All @@ -29,8 +27,7 @@ def {attr_name}{signature}:
'''{attr.__doc__}'''
"""

procedural_memory += textwrap.dedent(
"""
procedural_memory += textwrap.dedent("""
Your response should be formatted like this:
(Previous action verification)
Carefully analyze based on the screenshot and the accessibility tree if the previous action was successful. If the previous action was not successful, provide a reason for the failure.
Expand All @@ -57,8 +54,7 @@ def {attr_name}{signature}:
7. Do not do anything other than the exact specified task. Return with `agent.done()` immediately after the task is completed or `agent.fail()` if it cannot be completed.
8. Whenever possible use hot-keys or typing rather than mouse clicks.
9. My computer's password is 'password', feel free to use it when you need sudo rights
"""
)
""")
return procedural_memory.strip()

# MANAGER_PROMPT = """You are a planning agent for solving GUI navigation tasks. You will be provided the initial configuration of a system including accessibility, screenshot and other information. You need to solve the following task: TASK_DESCRIPTION. You will describe in as much detail as possible the steps required to complete the task by a GUI agent. Please do not include any verification steps in your plan, as that is not your responsibility. IMPORTANT: Your plan should be as concise as possible and should not include any unnecessary steps. Do not fine-tune or embellish anything, or cause any side effects. Generate the plan that can be accomplished in the shortest time. Please take the current state into account when generating the plan. Please provide the plan in a step-by-step format and make sure you do not include anything that's already done in the GUI in your plan."""
Expand Down
6 changes: 2 additions & 4 deletions gui_agents/s2/agents/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,13 +159,11 @@ def generate_next_action(
if self.enable_reflection:
# Load the initial subtask info
if self.turn_count == 0:
text_content = textwrap.dedent(
f"""
text_content = textwrap.dedent(f"""
Subtask Description: {subtask}
Subtask Information: {subtask_info}
Current Trajectory below:
"""
)
""")
updated_sys_prompt = (
self.reflection_agent.system_prompt + "\n" + text_content
)
Expand Down
36 changes: 12 additions & 24 deletions gui_agents/s2/memory/procedural_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@ class PROCEDURAL_MEMORY:

@staticmethod
def construct_worker_procedural_memory(agent_class, skipped_actions):
procedural_memory = textwrap.dedent(
f"""\
procedural_memory = textwrap.dedent(f"""\
You are an expert in graphical user interfaces and Python code. You are responsible for executing the current subtask: `SUBTASK_DESCRIPTION` of the larger goal: `TASK_DESCRIPTION`.
IMPORTANT: ** The subtasks: ['DONE_TASKS'] have already been done. The future subtasks ['FUTURE_TASKS'] will be done in the future by me. You must only perform the current subtask: `SUBTASK_DESCRIPTION`. Do not try to do future subtasks. **
You are working in CURRENT_OS. You must only complete the subtask provided and not the larger goal.
Expand All @@ -16,8 +15,7 @@ def construct_worker_procedural_memory(agent_class, skipped_actions):
2. The history of your previous interactions with the UI.
3. Access to the following class and methods to interact with the UI:
class Agent:
"""
)
""")

for attr_name in dir(agent_class):
if attr_name in skipped_actions:
Expand All @@ -32,8 +30,7 @@ def {attr_name}{signature}:
'''{attr.__doc__}'''
"""

procedural_memory += textwrap.dedent(
"""
procedural_memory += textwrap.dedent("""
Your response should be formatted like this:
(Previous action verification)
Carefully analyze based on the screenshot if the previous action was successful. If the previous action was not successful, provide a reason for the failure.
Expand All @@ -60,14 +57,12 @@ def {attr_name}{signature}:
8. Whenever possible, your grounded action should use hot-keys with the agent.hotkey() action instead of clicking or dragging.
9. My computer's password is 'password', feel free to use it when you need sudo rights.
10. Do not use the "command" + "tab" hotkey on MacOS.
"""
)
""")

return procedural_memory.strip()

# Manager prompt that generalizes to initial planning, re-planning after subtask completion, and re-planning after failure
COMBINED_MANAGER_PROMPT = textwrap.dedent(
"""
COMBINED_MANAGER_PROMPT = textwrap.dedent("""
You are an expert planning agent for solving GUI navigation tasks. You need to generate a plan for solving the following task: TASK_DESCRIPTION.

You are provided with:
Expand All @@ -91,8 +86,7 @@ def {attr_name}{signature}:
- If you feel the trajectory and future subtasks seem correct based on the current state of the desktop, you may re-use future subtasks.
- If you feel some future subtasks are not detailed enough, use your observations from the desktop screenshot to update these subtasks to be more detailed.
- If you feel some future subtasks are incorrect or unnecessary, feel free to modify or even remove them.
"""
)
""")

# USED IN OSWORLD EXPERIMENTS
RAG_AGENT_OSWORLD = """
Expand All @@ -107,8 +101,7 @@ def {attr_name}{signature}:
"""

# For reflection agent, post-action verification mainly for cycle detection
REFLECTION_ON_TRAJECTORY = textwrap.dedent(
"""
REFLECTION_ON_TRAJECTORY = textwrap.dedent("""
You are a reflection agent designed to assist in subtask execution by reflecting on the trajectory of a subtask and providing feedback for what the next step should be.
You have access to the Subtask Description and the Current Trajectory of another computer agent. The Current Trajectory is a sequence of a desktop image, chain-of-thought reasoning, and a desktop action for each time step. The last image is the screen's display after the last action.
Your task is to generate a reflection. Your generated reflection must fall under one of the two cases listed below:
Expand All @@ -120,8 +113,7 @@ def {attr_name}{signature}:
- DO NOT suggest any specific future plans or actions. Your only goal is to provide a reflection, not an actual plan or action.
- Any response that falls under Case 1 should explain why the trajectory is not going according to plan. You should especially lookout for cycles of actions that are continually repeated with no progress.
- Any response that falls under Case 2 should be concise, since you just need to affirm the agent to continue with the current trajectory.
"""
)
""")

TASK_SUMMARIZATION_PROMPT = """
You are a summarization agent designed to analyze a trajectory of desktop task execution.
Expand Down Expand Up @@ -178,8 +170,7 @@ def {attr_name}{signature}:
Analyze the given plan and provide the output in this JSON format within the <json></json> tags. Ensure the JSON is valid and properly escaped.
"""

SUBTASK_SUMMARIZATION_PROMPT = textwrap.dedent(
"""
SUBTASK_SUMMARIZATION_PROMPT = textwrap.dedent("""
You are a summarization agent designed to analyze a trajectory of desktop task execution.
You will summarize the correct plan and grounded actions based on the whole trajectory of a subtask, ensuring the summarized plan contains only correct and necessary steps.

Expand All @@ -195,8 +186,7 @@ def {attr_name}{signature}:
Action: [Description of the correct action]
Grounded Action: [Grounded actions with the \"element1_description\" replacement when needed]
5. Exclude any other details that are not necessary for completing the task.
"""
)
""")

STATE_EVALUATOR_SYSTEM_PROMPT = """
You are an impartial evaluator to evaluate the completeness of the given desktop computer task, you are also an expert of accessibility tree, os environment and python programming.
Expand Down Expand Up @@ -242,8 +232,7 @@ def {attr_name}{signature}:
Only say Yes or No in the Judgment section. Do not provide any other information in the Judgment section.
"""

PHRASE_TO_WORD_COORDS_PROMPT = textwrap.dedent(
"""
PHRASE_TO_WORD_COORDS_PROMPT = textwrap.dedent("""
You are an expert in graphical user interfaces. Your task is to process a phrase of text, and identify the most relevant word on the computer screen.
You are provided with a phrase, a table with all the text on the screen, and a screenshot of the computer screen. You will identify the single word id that is best associated with the provided phrase.
This single word must be displayed on the computer screenshot, and its location on the screen should align with the provided phrase.
Expand All @@ -254,5 +243,4 @@ def {attr_name}{signature}:
2. Then, output the unique word id. Remember, the word id is the 1st number in each row of the text table.
3. If there are multiple occurrences of the same word, use the surrounding context in the phrase to choose the correct one. Pay very close attention to punctuation and capitalization.

"""
)
""")
6 changes: 2 additions & 4 deletions gui_agents/s2_5/agents/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,10 @@ def generate_next_action(
if self.enable_reflection:
# Load the initial message
if self.turn_count == 0:
text_content = textwrap.dedent(
f"""
text_content = textwrap.dedent(f"""
Task Description: {instruction}
Current Trajectory below:
"""
)
""")
updated_sys_prompt = (
self.reflection_agent.system_prompt + "\n" + text_content
)
Expand Down
24 changes: 8 additions & 16 deletions gui_agents/s2_5/memory/procedural_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,15 @@
class PROCEDURAL_MEMORY:
@staticmethod
def construct_simple_worker_procedural_memory(agent_class, skipped_actions):
procedural_memory = textwrap.dedent(
f"""\
procedural_memory = textwrap.dedent(f"""\
You are an expert in graphical user interfaces and Python code. You are responsible for executing the task: `TASK_DESCRIPTION`.
You are working in CURRENT_OS.
You are provided with:
1. A screenshot of the current time step.
2. The history of your previous interactions with the UI.
3. Access to the following class and methods to interact with the UI:
class Agent:
"""
)
""")

for attr_name in dir(agent_class):
if attr_name in skipped_actions:
Expand All @@ -30,8 +28,7 @@ def {attr_name}{signature}:
'''{attr.__doc__}'''
"""

procedural_memory += textwrap.dedent(
"""
procedural_memory += textwrap.dedent("""
Your response should be formatted like this:
(Previous action verification)
Carefully analyze based on the screenshot if the previous action was successful. If the previous action was not successful, provide a reason for the failure.
Expand All @@ -58,14 +55,12 @@ def {attr_name}{signature}:
8. Generate agent.fail() as your grounded action if you get exhaustively stuck on the task and believe it is impossible.
9. Generate agent.done() as your grounded action when your believe the task is fully complete.
10. Do not use the "command" + "tab" hotkey on MacOS.
"""
)
""")

return procedural_memory.strip()

# For reflection agent, post-action verification mainly for cycle detection
REFLECTION_ON_TRAJECTORY = textwrap.dedent(
"""
REFLECTION_ON_TRAJECTORY = textwrap.dedent("""
You are an expert computer use agent designed to reflect on the trajectory of a task and provide feedback on what has happened so far.
You have access to the Task Description and the Current Trajectory of another computer agent. The Current Trajectory is a sequence of a desktop image, chain-of-thought reasoning, and a desktop action for each time step. The last image is the screen's display after the last action.
Your task is to generate a reflection. Your generated reflection must fall under one of the cases listed below:
Expand All @@ -79,11 +74,9 @@ def {attr_name}{signature}:
- DO NOT suggest any specific future plans or actions. Your only goal is to provide a reflection, not an actual plan or action.
- Any response that falls under Case 1 should explain why the trajectory is not going according to plan. You should especially lookout for cycles of actions that are continually repeated with no progress.
- Any response that falls under Case 2 should be concise, since you just need to affirm the agent to continue with the current trajectory.
"""
)
""")

PHRASE_TO_WORD_COORDS_PROMPT = textwrap.dedent(
"""
PHRASE_TO_WORD_COORDS_PROMPT = textwrap.dedent("""
You are an expert in graphical user interfaces. Your task is to process a phrase of text, and identify the most relevant word on the computer screen.
You are provided with a phrase, a table with all the text on the screen, and a screenshot of the computer screen. You will identify the single word id that is best associated with the provided phrase.
This single word must be displayed on the computer screenshot, and its location on the screen should align with the provided phrase.
Expand All @@ -94,5 +87,4 @@ def {attr_name}{signature}:
2. Then, output the unique word id. Remember, the word id is the 1st number in each row of the text table.
3. If there are multiple occurrences of the same word, use the surrounding context in the phrase to choose the correct one. Pay very close attention to punctuation and capitalization.

"""
)
""")
6 changes: 2 additions & 4 deletions gui_agents/s3/agents/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,12 +142,10 @@ def _generate_reflection(self, instruction: str, obs: Dict) -> Tuple[str, str]:
if self.enable_reflection:
# Load the initial message
if self.turn_count == 0:
text_content = textwrap.dedent(
f"""
text_content = textwrap.dedent(f"""
Task Description: {instruction}
Current Trajectory below:
"""
)
""")
updated_sys_prompt = (
self.reflection_agent.system_prompt + "\n" + text_content
)
Expand Down
4 changes: 1 addition & 3 deletions gui_agents/s3/bbon/behavior_narrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,7 @@ def place_text(label, color, x, y):
offset_x = 5
if y + offset_y < 0: # Out of bounds on top
offset_y = 5
draw.text(
(x + offset_x, y + offset_y), label, fill=color, font=font
)
draw.text((x + offset_x, y + offset_y), label, fill=color, font=font)

if mouse_action.startswith("pyautogui.click"):
draw.circle((width, height), radius=3, fill=(255, 0, 0))
Expand Down
Loading
Loading