Browser and Computer Operations

Introduction

Enabling agents to operate browsers and computers the way humans do is an important extension of agent capabilities. From browser automation to full desktop control, advances in Vision-Language Models (VLMs) have made GUI agents a practical reality.

Browser Automation

Traditional Approach: DOM Manipulation

Automation based on HTML DOM structure:

# Playwright example
from playwright.async_api import async_playwright

async def browser_automation():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Navigate
        await page.goto("https://example.com")

        # Click
        await page.click("button#submit")

        # Fill form
        await page.fill("input[name='search']", "AI Agent")

        # Extract content
        content = await page.text_content("div.result")

        # Screenshot
        await page.screenshot(path="screenshot.png")

        await browser.close()
        return content
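
The coroutine runs on a standard asyncio event loop; a minimal entry point:

import asyncio

if __name__ == "__main__":
    print(asyncio.run(browser_automation()))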

Agent-Driven Browser Operations

Wrapping browser operations as agent tools:

import base64

class BrowserTools:
    def __init__(self, page=None):
        self.page = page  # A Playwright Page, attached before any tool is called

    async def navigate(self, url: str) -> str:
        """Navigate to a specified URL"""
        await self.page.goto(url, wait_until="networkidle")
        return f"Navigated to {url}"

    async def click(self, selector: str) -> str:
        """Click a page element"""
        await self.page.click(selector)
        return f"Clicked {selector}"

    async def type_text(self, selector: str, text: str) -> str:
        """Type text into an input field"""
        await self.page.fill(selector, text)
        return f"Typed '{text}' into {selector}"

    async def get_text(self, selector: str) -> str:
        """Get element text"""
        return await self.page.text_content(selector)

    async def screenshot(self) -> str:
        """Take a screenshot of the current page"""
        screenshot = await self.page.screenshot()
        return base64.b64encode(screenshot).decode()  # Return image data as base64

    async def get_page_content(self) -> str:
        """Get the page's accessibility tree"""
        accessibility_tree = await self.page.accessibility.snapshot()
        # format_accessibility_tree: see the format_node helper under "Approach 3" below
        return format_accessibility_tree(accessibility_tree)
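
Before any tool is called, the agent needs a live page to attach. A minimal setup sketch (the helper name make_browser_tools is illustrative):

from playwright.async_api import async_playwright

async def make_browser_tools():
    # Start Playwright, launch a browser, and hand a fresh page to the tools
    p = await async_playwright().start()
    browser = await p.chromium.launch(headless=True)
    return BrowserTools(await browser.new_page()), browser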

GUI Agent Approaches

Approach 1: Screenshot-to-Action

Using VLMs to directly understand interfaces from screenshots and decide on actions:

Screenshot → VLM Analysis → Decide Action (click coordinates / type text) → Execute → New Screenshot → ...

class ScreenshotAgent:
    def __init__(self, vlm):
        self.vlm = vlm  # Any VLM client that accepts images and tool schemas

    async def step(self, task, screenshot):
        """Decide the next action based on a screenshot"""
        response = await self.vlm.generate(  # Assumes an async client interface
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": f"Task: {task}\nAnalyze the screenshot and decide the next action."},
                        {"type": "image", "source": screenshot}
                    ]
                }
            ],
            tools=[
                {"name": "click", "parameters": {"x": "int", "y": "int"}},
                {"name": "type", "parameters": {"text": "string"}},
                {"name": "scroll", "parameters": {"direction": "up|down"}},
                {"name": "done", "parameters": {"result": "string"}},
            ]
        )
        return response.tool_calls
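
step only decides; an outer loop must execute what it returns. A minimal sketch of that loop against a Playwright page, assuming the tool-call objects expose name and arguments as implied by the generic VLM client above:

async def run_screenshot_agent(agent, page, task, max_steps=20):
    """Drive the screenshot -> decide -> act loop until done or the budget runs out"""
    for _ in range(max_steps):
        screenshot = await page.screenshot()
        for call in await agent.step(task, screenshot):
            if call.name == "done":
                return call.arguments["result"]
            if call.name == "click":
                await page.mouse.click(call.arguments["x"], call.arguments["y"])
            elif call.name == "type":
                await page.keyboard.type(call.arguments["text"])
            elif call.name == "scroll":
                # Positive wheel delta scrolls down, negative scrolls up
                delta = -500 if call.arguments["direction"] == "up" else 500
                await page.mouse.wheel(0, delta)
    return "Step budget exhausted"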

Approach 2: DOM-to-Action

Parse the HTML DOM structure and let the LLM decide actions based on a textual representation:

async def get_simplified_dom(page):
    """Get a simplified DOM representation"""
    elements = await page.evaluate("""
        () => {
            const interactable = document.querySelectorAll(
                'a, button, input, select, textarea, [role="button"], [onclick]'
            );
            return Array.from(interactable).map((el, idx) => ({
                id: idx,
                tag: el.tagName.toLowerCase(),
                text: el.textContent?.trim().substring(0, 100),
                type: el.type || '',
                href: el.href || '',
                placeholder: el.placeholder || '',
            }));
        }
    """)

    # Format into LLM-readable text
    dom_text = "Interactable elements:\n"
    for el in elements:
        dom_text += f"[{el['id']}] <{el['tag']}> {el['text']}"
        if el['type']:
            dom_text += f" (type={el['type']})"
        if el['href']:
            dom_text += f" (href={el['href']})"
        dom_text += "\n"

    return dom_text
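
The numeric ids are only useful if they can be mapped back to live elements. One way to do that, assuming the page has not changed since the snapshot (the same selector enumerates elements in the same document order as querySelectorAll):

INTERACTABLE = 'a, button, input, select, textarea, [role="button"], [onclick]'

async def click_element_by_id(page, element_id: int):
    """Click the element whose index matches the id shown to the LLM"""
    await page.locator(INTERACTABLE).nth(element_id).click()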

Approach 3: Accessibility Tree

Using the accessibility tree as a structured representation of the page:

async def get_accessibility_tree(page):
    """Get the page's accessibility tree"""
    tree = await page.accessibility.snapshot()
    if tree is None:  # Blank pages can produce an empty snapshot
        return ""

    def format_node(node, depth=0):
        indent = "  " * depth
        text = f"{indent}{node['role']}"
        if node.get('name'):
            text += f': "{node["name"]}"'
        if node.get('value'):
            text += f' (value: {node["value"]})'
        text += "\n"

        for child in node.get('children', []):
            text += format_node(child, depth + 1)
        return text

    return format_node(tree)

# Example output:
# WebArea: "Search - Google"
#   navigation: "Navigation"
#     link: "Gmail"
#     link: "Images"
#   search: "Search"
#     textbox: "Search" (value: "")
#     button: "Google Search"

Anthropic Computer Use

Overview

Anthropic's Computer Use enables Claude to operate a computer like a human: view the screen, move the mouse, click buttons, and type text.

Tool Definitions

# Anthropic Computer Use provides three built-in tools
computer_tools = [
    {
        "type": "computer_20241022",
        "name": "computer",
        "display_width_px": 1024,
        "display_height_px": 768,
    },
    {
        "type": "text_editor_20241022",
        "name": "str_replace_editor",
    },
    {
        "type": "bash_20241022",
        "name": "bash",
    }
]

Usage Example

import anthropic

client = anthropic.Anthropic()

# Computer Use is a beta feature: the request opts in via the betas flag, and
# the 20241022 tool versions pair with Claude 3.5 Sonnet
response = client.beta.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=1024,
    tools=computer_tools,
    betas=["computer-use-2024-10-22"],
    messages=[{
        "role": "user",
        "content": "Open Firefox browser and search for today's weather"
    }]
)

# Claude returns tool calls such as:
# 1. computer(action="screenshot") - First take a screenshot to see current state
# 2. computer(action="mouse_move", coordinate=[500, 400]) - Move mouse
# 3. computer(action="left_click") - Click
# 4. computer(action="type", text="weather") - Type text
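
The model only requests actions; the calling code must execute each one and feed the result back as a tool_result block. A minimal sketch of that loop, where execute_computer_action is a hypothetical local executor that performs the action on the display and returns its output (screenshots as image blocks):

messages = [{"role": "user", "content": "Open Firefox browser and search for today's weather"}]
while True:
    response = client.beta.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=1024,
        tools=computer_tools,
        betas=["computer-use-2024-10-22"],
        messages=messages,
    )
    messages.append({"role": "assistant", "content": response.content})
    tool_results = [
        {"type": "tool_result", "tool_use_id": block.id,
         "content": execute_computer_action(block.input)}  # hypothetical executor
        for block in response.content if block.type == "tool_use"
    ]
    if not tool_results:
        break  # No more tool calls: Claude considers the task finished
    messages.append({"role": "user", "content": tool_results})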

Action Space

With the computer_20241022 tool version, click actions act at the current cursor position, so the model moves the mouse first (as in the example above). Later revisions (e.g., computer_20250124) accept a coordinate directly on clicks and add actions such as scroll.

| Action | Description | Parameters |
|--------|-------------|------------|
| screenshot | Capture the screen | None |
| mouse_move | Move the mouse | coordinate: [x, y] |
| left_click | Left click | None (current cursor position) |
| right_click | Right click | None (current cursor position) |
| double_click | Double click | None (current cursor position) |
| type | Type text | text: string |
| key | Press a key or chord | key: string (e.g., "Return", "ctrl+c") |
| left_click_drag | Drag the mouse | start_coordinate: [x, y], coordinate: [x, y] |

WebVoyager and Web Agents

WebVoyager (He et al., 2024)

Uses VLMs to complete complex tasks on real websites:

Task: "Search for the highest-rated noise-canceling headphones on Amazon under $100"

Agent execution flow:
1. Screenshot → Identify search box → Type "noise canceling headphones"
2. Screenshot → Identify filters → Click price filter
3. Screenshot → Set price range → Click apply
4. Screenshot → Identify sort option → Sort by rating
5. Screenshot → Extract results → Return recommendations

Web Agent Architecture

graph TB
    TASK[User Task] --> PLAN[Task Planning]
    PLAN --> LOOP{Agent Loop}

    LOOP --> OBS[Observe<br/>Screenshot/DOM/A11y]
    OBS --> THINK[Think<br/>Analyze Current State]
    THINK --> ACT[Act<br/>Click/Type/Scroll]
    ACT --> ENV[Browser Environment]
    ENV --> OBS

    THINK -->|Task Complete| RESULT[Return Result]

Screen Understanding and VLMs

Set-of-Mark (SoM)

Annotate interactable elements with numbered labels on screenshots to help VLMs locate them precisely:

def add_set_of_mark(screenshot, elements):
    """Annotate element numbers on a screenshot"""
    from PIL import Image, ImageDraw  # ImageFont is not needed for the default font

    img = Image.open(screenshot)
    draw = ImageDraw.Draw(img)

    for idx, el in enumerate(elements):
        x, y, w, h = el["bbox"]
        # Draw bounding box
        draw.rectangle([x, y, x+w, y+h], outline="red", width=2)
        # Label number (clamped so it stays inside the image)
        draw.text((x, max(0, y - 15)), str(idx), fill="red")

    return img
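
The bounding boxes can come straight from the DOM. A sketch using Playwright, assuming el["bbox"] is (x, y, width, height) in viewport pixels so the boxes line up with a default viewport-sized screenshot:

async def screenshot_with_marks(page):
    # Collect bounding boxes of interactable elements in viewport coordinates
    elements = await page.evaluate("""
        () => Array.from(document.querySelectorAll('a, button, input, select, textarea'))
            .map(el => {
                const r = el.getBoundingClientRect();
                return {bbox: [r.x, r.y, r.width, r.height]};
            })
    """)
    await page.screenshot(path="page.png")
    return add_set_of_mark("page.png", elements)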

Multimodal Input

# Combining screenshots with structured information
prompt = f"""
You are operating a browser to complete the task: {task}

See the current page screenshot in the image.
Current URL: {current_url}
Interactable elements:
{accessibility_tree}

Choose the next action.
"""

Security Considerations

Risks

  1. Sensitive information leakage: the screen may contain passwords and personal information
  2. Unintended actions: the agent may click the wrong button (e.g., delete or pay)
  3. Malicious websites: the agent may be deceived by phishing sites
  4. Excessive permissions: OS-level access grants the agent broad privileges

Safety Measures

from urllib.parse import urlparse

class SafeBrowserAgent:
    BLOCKED_ACTIONS = [
        "confirm payment", "delete account", "send email",  # High-risk actions
    ]

    BLOCKED_DOMAINS = [
        "bank.com", "payment.com",  # Financial websites
    ]

    async def safe_execute(self, action, context):
        """Execute an action only after it passes safety checks"""
        # 1. Domain check: compare the hostname rather than the raw URL string,
        #    so "evil.com/?q=bank.com" is not misclassified
        host = urlparse(context["url"]).hostname or ""
        if any(host == d or host.endswith("." + d) for d in self.BLOCKED_DOMAINS):
            return "Denied: automated operations are not allowed on financial websites"

        # 2. High-risk action check: require human confirmation
        if any(a in str(action).lower() for a in self.BLOCKED_ACTIONS):
            confirmed = await self.request_human_confirmation(action)
            if not confirmed:
                return "User cancelled the operation"

        # 3. Execute the approved action
        return await self.execute(action)
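
request_human_confirmation is left abstract above; one possible implementation of the method, as a minimal terminal-based stand-in (assuming an operator is watching the console — a real deployment would surface this in a UI):

    async def request_human_confirmation(self, action) -> bool:
        """Block on operator input without stalling the event loop"""
        import asyncio
        answer = await asyncio.to_thread(input, f"Allow high-risk action {action!r}? [y/N] ")
        return answer.strip().lower() == "y"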

Further Reading

  • Web Agents - Specific applications of web agents
  • He, H., et al. (2024). "WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models"
  • Anthropic. "Computer Use" Documentation
  • Zheng, B., et al. (2024). "GPT-4V(ision) is a Generalist Web Agent, if Grounded"
