Browser and Computer Operations
Introduction
Enabling agents to operate browsers and computers like humans is an important extension of agent capabilities. From browser automation to full desktop operation, advances in Vision-Language Models (VLMs) have made GUI agents a reality.
Browser Automation
Traditional Approach: DOM Manipulation
Automation based on HTML DOM structure:
# Playwright example
from playwright.async_api import async_playwright
async def browser_automation():
    """Demonstrate basic Playwright browser automation.

    Launches headless Chromium, navigates to a page, interacts with form
    elements, takes a screenshot, and returns extracted text.

    Returns:
        The text content of the ``div.result`` element (``None`` if absent).
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            # Navigate
            await page.goto("https://example.com")
            # Click
            await page.click("button#submit")
            # Fill form
            await page.fill("input[name='search']", "AI Agent")
            # Extract content
            content = await page.text_content("div.result")
            # Screenshot
            await page.screenshot(path="screenshot.png")
        finally:
            # Close even if an interaction raises; otherwise the Chromium
            # process leaks past this coroutine.
            await browser.close()
    return content
Agent-Driven Browser Operations
Wrapping browser operations as agent tools:
class BrowserTools:
    """Playwright page operations wrapped as agent-callable tools.

    ``self.page`` must be set to a live Playwright ``Page`` object before
    any tool method is invoked.
    """

    def __init__(self):
        # Assigned externally once a browser page has been opened.
        self.page = None

    async def navigate(self, url: str) -> str:
        """Navigate to a specified URL, waiting until the network is idle."""
        await self.page.goto(url, wait_until="networkidle")
        return f"Navigated to {url}"

    async def click(self, selector: str) -> str:
        """Click a page element identified by a CSS selector."""
        await self.page.click(selector)
        return f"Clicked {selector}"

    async def type_text(self, selector: str, text: str) -> str:
        """Type text into an input field identified by a CSS selector."""
        await self.page.fill(selector, text)
        return f"Typed '{text}' into {selector}"

    async def get_text(self, selector: str) -> str:
        """Get the text content of the element matching *selector*."""
        return await self.page.text_content(selector)

    async def screenshot(self) -> bytes:
        """Take a screenshot of the current page.

        Returns the raw image bytes. (The previous ``-> str`` annotation was
        wrong: ``page.screenshot()`` without ``path`` returns ``bytes``.)
        """
        screenshot = await self.page.screenshot()
        return screenshot  # Return image data

    async def get_page_content(self) -> str:
        """Get the page's accessibility tree as formatted text.

        NOTE(review): ``format_accessibility_tree`` is defined elsewhere in
        the project — confirm it accepts the snapshot dict produced here.
        """
        accessibility_tree = await self.page.accessibility.snapshot()
        return format_accessibility_tree(accessibility_tree)
GUI Agent Approaches
Approach 1: Screenshot-to-Action
Using VLMs to directly understand interfaces from screenshots and decide on actions:
Screenshot → VLM Analysis → Decide Action (click coordinates / type text) → Execute → New Screenshot → ...
class ScreenshotAgent:
    """GUI agent that decides its next action by showing a VLM the screen."""

    def __init__(self, vlm):
        # Vision-language model client; must expose ``generate(messages=, tools=)``.
        self.vlm = vlm

    async def step(self, task, screenshot):
        """Decide the next action based on a screenshot"""
        instruction = f"Task: {task}\nAnalyze the screenshot and decide the next action."
        user_turn = {
            "role": "user",
            "content": [
                {"type": "text", "text": instruction},
                {"type": "image", "source": screenshot},
            ],
        }
        # Action space the model may choose from on every step.
        action_space = [
            {"name": "click", "parameters": {"x": "int", "y": "int"}},
            {"name": "type", "parameters": {"text": "string"}},
            {"name": "scroll", "parameters": {"direction": "up|down"}},
            {"name": "done", "parameters": {"result": "string"}},
        ]
        response = self.vlm.generate(messages=[user_turn], tools=action_space)
        return response.tool_calls
Approach 2: DOM-to-Action
Parse the HTML DOM structure and let the LLM decide actions based on a textual representation:
async def get_simplified_dom(page):
    """Return a compact, LLM-readable listing of interactable page elements.

    Runs a JS snippet inside the page to collect clickable/fillable
    elements, then renders one numbered line per element.
    """
    elements = await page.evaluate("""
() => {
const interactable = document.querySelectorAll(
'a, button, input, select, textarea, [role="button"], [onclick]'
);
return Array.from(interactable).map((el, idx) => ({
id: idx,
tag: el.tagName.toLowerCase(),
text: el.textContent?.trim().substring(0, 100),
type: el.type || '',
href: el.href || '',
placeholder: el.placeholder || '',
}));
}
""")
    # Assemble the report with a list + join rather than repeated +=.
    pieces = ["Interactable elements:\n"]
    for el in elements:
        line = f"[{el['id']}] <{el['tag']}> {el['text']}"
        if el['type']:
            line += f" (type={el['type']})"
        if el['href']:
            line += f" (href={el['href']})"
        pieces.append(line + "\n")
    return "".join(pieces)
Approach 3: Accessibility Tree
Using the accessibility tree as a structured representation of the page:
async def get_accessibility_tree(page):
    """Render the page's accessibility snapshot as an indented text tree."""
    tree = await page.accessibility.snapshot()

    def render(node, depth=0):
        # One line per node: role, optional name, optional value.
        line = " " * depth + node['role']
        if node.get('name'):
            line += f': "{node["name"]}"'
        if node.get('value'):
            line += f' (value: {node["value"]})'
        out = line + "\n"
        # Recurse into children one level deeper.
        for child in node.get('children', []):
            out += render(child, depth + 1)
        return out

    return render(tree)
# Example output:
# WebArea: "Search - Google"
#  navigation: "Navigation"
#   link: "Gmail"
#   link: "Images"
#  search: "Search"
#   textbox: "Search" (value: "")
#   button: "Google Search"
Anthropic Computer Use
Overview
Anthropic's Computer Use enables Claude to operate a computer like a human: view the screen, move the mouse, click buttons, and type text.
Tool Definitions
# Anthropic Computer Use provides three built-in tools
# (versioned tool types from the 2024-10-22 API release):
#   computer            - mouse/keyboard/screenshot control of a display
#   str_replace_editor  - text-file viewing and editing
#   bash                - shell command execution
computer_tools = [
    {
        # The display size tells Claude the coordinate space available
        # for mouse actions.
        "type": "computer_20241022",
        "name": "computer",
        "display_width_px": 1024,
        "display_height_px": 768,
    },
    {
        "type": "text_editor_20241022",
        "name": "str_replace_editor",
    },
    {
        "type": "bash_20241022",
        "name": "bash",
    },
]
Usage Example
import anthropic

client = anthropic.Anthropic()

# A single user turn asking Claude to drive the computer.
request = {
    "role": "user",
    "content": "Open Firefox browser and search for today's weather"
}
response = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    tools=computer_tools,
    messages=[request],
)
# Claude returns tool calls such as:
# 1. computer(action="screenshot") - First take a screenshot to see current state
# 2. computer(action="mouse_move", coordinate=[500, 400]) - Move mouse
# 3. computer(action="left_click") - Click
# 4. computer(action="type", text="weather") - Type text
Action Space
| Action | Description | Parameters |
|---|---|---|
| `screenshot` | Capture screen | None |
| `mouse_move` | Move mouse | `coordinate: [x, y]` |
| `left_click` | Left click | `coordinate: [x, y]` |
| `right_click` | Right click | `coordinate: [x, y]` |
| `double_click` | Double click | `coordinate: [x, y]` |
| `type` | Type text | `text: string` |
| `key` | Press key | `key: string` (e.g., "Return", "ctrl+c") |
| `scroll` | Scroll | `coordinate`, `direction` |
| `drag` | Drag | `start_coordinate`, `end_coordinate` |
WebVoyager and Web Agents
WebVoyager (He et al., 2024)
Uses VLMs to complete complex tasks on real websites:
Task: "Search for the highest-rated noise-canceling headphones on Amazon under $100"
Agent execution flow:
1. Screenshot → Identify search box → Type "noise canceling headphones"
2. Screenshot → Identify filters → Click price filter
3. Screenshot → Set price range → Click apply
4. Screenshot → Identify sort option → Sort by rating
5. Screenshot → Extract results → Return recommendations
Web Agent Architecture
graph TB
TASK[User Task] --> PLAN[Task Planning]
PLAN --> LOOP{Agent Loop}
LOOP --> OBS[Observe<br/>Screenshot/DOM/A11y]
OBS --> THINK[Think<br/>Analyze Current State]
THINK --> ACT[Act<br/>Click/Type/Scroll]
ACT --> ENV[Browser Environment]
ENV --> OBS
THINK -->|Task Complete| RESULT[Return Result]
Screen Understanding and VLMs
Set-of-Mark (SoM)
Annotate interactable elements with numbered labels on screenshots to help VLMs locate them precisely:
def add_set_of_mark(screenshot, elements):
    """Annotate a screenshot with numbered Set-of-Mark labels.

    Draws a red bounding box around each element and its index just above
    the box, so a VLM can refer to elements by number instead of by raw
    coordinates.

    Args:
        screenshot: Path or file object openable by ``PIL.Image.open``.
        elements: Iterable of dicts with a ``"bbox"`` entry of (x, y, w, h).

    Returns:
        The annotated ``PIL.Image.Image``.
    """
    # ImageFont was previously imported here but never used — removed.
    from PIL import Image, ImageDraw
    img = Image.open(screenshot)
    draw = ImageDraw.Draw(img)
    for idx, el in enumerate(elements):
        x, y, w, h = el["bbox"]
        # Draw bounding box
        draw.rectangle([x, y, x+w, y+h], outline="red", width=2)
        # Label number just above the box (may clip at the top edge).
        draw.text((x, y-15), str(idx), fill="red")
    return img
Multimodal Input
# Combining screenshots with structured information
# NOTE: this template assumes `task`, `current_url`, and
# `accessibility_tree` are already defined by the calling context; the
# screenshot itself is sent separately as an image input alongside the
# rendered prompt text.
prompt = f"""
You are operating a browser to complete the task: {task}
See the current page screenshot in the image.
Current URL: {current_url}
Interactable elements:
{accessibility_tree}
Choose the next action.
"""
Security Considerations
Risks
- Sensitive information leakage: Screen may contain passwords and personal information
- Misoperations: Agent may click wrong buttons (e.g., delete, pay)
- Malicious websites: Agent may be deceived by phishing sites
- Excessive permissions: OS-level access privileges
Safety Measures
class SafeBrowserAgent:
    """Browser agent wrapper that gates risky operations behind safety checks."""

    # Action substrings that require explicit human confirmation.
    BLOCKED_ACTIONS = [
        "confirm payment", "delete account", "send email",  # High-risk actions
    ]
    # Domains (including their subdomains) where automation is refused outright.
    BLOCKED_DOMAINS = [
        "bank.com", "payment.com",  # Financial websites
    ]

    async def safe_execute(self, action, context):
        """Execute after safety checks.

        Args:
            action: The action the agent intends to perform (stringified
                for high-risk keyword matching).
            context: Dict holding at least the current page ``"url"``.

        Returns:
            The result of ``self.execute(action)``, or a refusal message.
        """
        from urllib.parse import urlparse

        # 1. Domain check — match the parsed hostname rather than a raw
        # substring of the full URL, which would also trip on unrelated
        # URLs such as "https://example.com/article-about-bank.com".
        host = (urlparse(context["url"]).hostname or "").lower()
        if any(host == d or host.endswith("." + d) for d in self.BLOCKED_DOMAINS):
            return "Denied: automated operations not allowed on financial websites"
        # 2. High-risk action check
        if any(a in str(action) for a in self.BLOCKED_ACTIONS):
            # Require human confirmation
            confirmed = await self.request_human_confirmation(action)
            if not confirmed:
                return "User cancelled the operation"
        # 3. Execute
        return await self.execute(action)
Further Reading
- Web Agents - Specific applications of web agents
- He, H., et al. (2024). "WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models"
- Anthropic. "Computer Use" Documentation
- Zheng, B., et al. (2024). "GPT-4V(ision) is a Generalist Web Agent, if Grounded"