Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.
This commit is contained in:
teknium 2026-01-29 06:10:24 +00:00
parent 54ca0997ee
commit 248acf715e
12 changed files with 2626 additions and 134 deletions

View File

@ -2,14 +2,15 @@
# Copy this file to .env and fill in your API keys
# =============================================================================
# LLM PROVIDER (OpenRouter - Primary)
# LLM PROVIDER (OpenRouter)
# =============================================================================
# OpenRouter provides access to many models through one API
# Get at: https://openrouter.ai/keys
# All LLM calls go through OpenRouter - no direct provider keys needed
# Get your key at: https://openrouter.ai/keys
OPENROUTER_API_KEY=
# Default model to use (OpenRouter format: provider/model)
# Examples: anthropic/claude-sonnet-4, openai/gpt-4o, google/gemini-2.0-flash
# Examples: anthropic/claude-sonnet-4, openai/gpt-4o, google/gemini-2.0-flash, zhipuai/glm-4-plus
LLM_MODEL=anthropic/claude-sonnet-4
# =============================================================================
@ -31,14 +32,17 @@ FAL_KEY=
# =============================================================================
# TERMINAL TOOL CONFIGURATION (mini-swe-agent backend)
# =============================================================================
# Backend type: "local", "docker", or "modal"
# Backend type: "local", "singularity", "docker", or "modal"
# - local: Runs directly on your machine (fastest, no isolation)
# - docker: Runs in Docker containers (isolated, requires Docker installed)
# - singularity: Runs in Apptainer/Singularity containers (HPC clusters, no root needed)
# - docker: Runs in Docker containers (isolated, requires Docker + docker group)
# - modal: Runs in Modal cloud sandboxes (scalable, requires Modal account)
TERMINAL_ENV=docker
TERMINAL_ENV=singularity
# Docker image to use (for docker and modal backends)
TERMINAL_DOCKER_IMAGE=python:3.11-slim
# Container images (for singularity/docker/modal backends)
TERMINAL_DOCKER_IMAGE=python:3.11
TERMINAL_SINGULARITY_IMAGE=docker://python:3.11
TERMINAL_MODAL_IMAGE=python:3.11
# Working directory inside the container
TERMINAL_CWD=/tmp
@ -57,6 +61,73 @@ TERMINAL_LIFETIME_SECONDS=300
# This will authenticate via browser and store credentials locally.
# No API key needed in .env - Modal handles auth automatically.
# =============================================================================
# BROWSER TOOL CONFIGURATION (agent-browser + Browserbase)
# =============================================================================
# Browser automation requires Browserbase cloud service for remote browser execution.
# This allows the agent to navigate websites, fill forms, and extract information.
#
# STEALTH MODES:
# - Basic Stealth: ALWAYS active (random fingerprints, auto CAPTCHA solving)
# - Advanced Stealth: Requires BROWSERBASE_ADVANCED_STEALTH=true (Scale Plan only)
# Browserbase API Key - Cloud browser execution
# Get at: https://browserbase.com/
BROWSERBASE_API_KEY=
# Browserbase Project ID - From your Browserbase dashboard
BROWSERBASE_PROJECT_ID=
# Enable residential proxies for better CAPTCHA solving (default: true)
# Routes traffic through residential IPs, significantly improves success rate
BROWSERBASE_PROXIES=true
# Enable advanced stealth mode (default: false, requires Scale Plan)
# Uses custom Chromium build to avoid bot detection altogether
BROWSERBASE_ADVANCED_STEALTH=false
# Browser session timeout in seconds (optional, default: 300)
# Sessions are cleaned up after this duration of inactivity
BROWSER_SESSION_TIMEOUT=300
# =============================================================================
# LEGACY/OPTIONAL API KEYS
# =============================================================================
@ -69,10 +140,6 @@ MORPH_API_KEY=
HECATE_VM_LIFETIME_SECONDS=300
HECATE_DEFAULT_SNAPSHOT_ID=snapshot_p5294qxt
# Direct provider keys (optional - OpenRouter is preferred)
ANTHROPIC_API_KEY=
OPENAI_API_KEY=
# =============================================================================
# DEBUG OPTIONS
# =============================================================================
@ -80,3 +147,12 @@ WEB_TOOLS_DEBUG=false
VISION_TOOLS_DEBUG=false
MOA_TOOLS_DEBUG=false
IMAGE_TOOLS_DEBUG=false
# Scratch directory for Singularity sandboxes (optional)
# If not set, uses /scratch (if available) or /tmp
# Set this to a directory with lots of space for large pip installs
# TERMINAL_SCRATCH_DIR=/scratch/myuser
# Disk usage warning threshold in GB (default: 500)
# Warning is printed when total sandbox disk usage exceeds this
TERMINAL_DISK_WARNING_GB=500

View File

@ -6,6 +6,7 @@ An AI agent with advanced tool-calling capabilities, featuring a flexible toolse
- **Web Tools**: Search, extract content, and crawl websites
- **Terminal Tools**: Execute commands via mini-swe-agent (local, Docker, or Modal backends)
- **Browser Tools**: Automate web browsers to navigate, click, type, and extract content
- **Vision Tools**: Analyze images from URLs
- **Reasoning Tools**: Advanced multi-model reasoning (Mixture of Agents)
- **Creative Tools**: Generate images from text prompts
@ -53,9 +54,9 @@ nano .env # or use your preferred editor
- `NOUS_API_KEY` - Vision & reasoning tools (get at: https://inference-api.nousresearch.com/)
- `FAL_KEY` - Image generation (get at: https://fal.ai/)
**Optional API Keys:**
- `ANTHROPIC_API_KEY` - Direct Anthropic access (if not using OpenRouter)
- `OPENAI_API_KEY` - Direct OpenAI access (if not using OpenRouter)
**Optional API Keys (for specific features):**
- `BROWSERBASE_API_KEY` - Browser automation (get at: https://browserbase.com/)
- `BROWSERBASE_PROJECT_ID` - From Browserbase dashboard
- `MORPH_API_KEY` - For legacy Hecate terminal backend (get at: https://morph.so/)
### 4. Configure Terminal Backend
@ -63,19 +64,22 @@ nano .env # or use your preferred editor
The terminal tool uses **mini-swe-agent** environments. Configure in `.env`:
```bash
# Backend: "local" (host machine), "docker" (containers), or "modal" (cloud)
TERMINAL_ENV=local # Default: runs on host machine
TERMINAL_ENV=docker # Recommended: isolated Docker containers
# Backend: "local", "docker", "singularity", or "modal"
TERMINAL_ENV=local # Default: runs on host machine (no isolation)
TERMINAL_ENV=singularity # Recommended for HPC: Apptainer/Singularity containers
TERMINAL_ENV=docker # Isolated Docker containers
TERMINAL_ENV=modal # Cloud execution via Modal
# Docker settings (for docker/modal backends)
# Container image (for docker/singularity/modal backends)
TERMINAL_DOCKER_IMAGE=python:3.11
TERMINAL_SINGULARITY_IMAGE=docker://python:3.11
TERMINAL_TIMEOUT=60
```
**Backend Requirements:**
- **local**: No extra setup (runs directly on your machine)
- **docker**: Requires Docker installed and running. User must be in `docker` group.
- **local**: No extra setup (runs directly on your machine, no isolation)
- **singularity**: Requires Apptainer or Singularity installed (common on HPC clusters, no root needed)
- **docker**: Requires Docker installed and user in `docker` group
- **modal**: Requires Modal account (see setup below)
### Modal Cloud Backend Setup
@ -95,6 +99,55 @@ TERMINAL_ENV=modal
Modal uses CLI-based authentication (stored in `~/.modal/`), so no API key is needed in `.env`. After running `modal setup`, commands will automatically execute in Modal's cloud sandboxes.
### Browser Tools Setup
Browser tools enable the agent to navigate websites, fill forms, click buttons, and extract content. They use [agent-browser](https://github.com/vercel-labs/agent-browser) CLI with [Browserbase](https://browserbase.com) cloud execution.
```bash
# 1. Install Node.js (if not already installed)
# Use nvm (recommended) or your package manager
# 2. Install agent-browser CLI globally
npm install -g agent-browser
# 3. Get Browserbase credentials
# Sign up at https://browserbase.com/ and get your:
# - API Key (from Settings → API Keys)
# - Project ID (from your project dashboard)
# 4. Add to your .env file:
BROWSERBASE_API_KEY=your_api_key_here
BROWSERBASE_PROJECT_ID=your_project_id_here
```
**Available Browser Tools:**
| Tool | Description |
|------|-------------|
| `browser_navigate` | Navigate to a URL |
| `browser_snapshot` | Get text-based page snapshot with element refs |
| `browser_click` | Click an element by ref (e.g., `@e5`) |
| `browser_type` | Type text into an input field |
| `browser_scroll` | Scroll up or down |
| `browser_back` | Go back in browser history |
| `browser_press` | Press a keyboard key (Enter, Tab, etc.) |
| `browser_close` | Close the browser session |
| `browser_get_images` | Get list of images on the page |
**Example Usage:**
```bash
# Use browser tools with web search and vision
python run_agent.py \
--query "Go to amazon.com and find the price of the latest Kindle" \
--enabled_toolsets=browser,web,vision
# Use browser-focused distribution
python batch_runner.py \
--dataset_file=browser_tasks.jsonl \
--distribution=browser_use \
--run_name=browser_run
```
See `.env.example` for all available configuration options including debug settings.
## Toolsets System
@ -267,10 +320,6 @@ All environment variables can be configured in the `.env` file (copy from `.env.
- `NOUS_API_KEY`: Vision and reasoning tools
- `FAL_KEY`: Image generation tools
**Optional Direct Provider Keys:**
- `ANTHROPIC_API_KEY`: Direct Anthropic access (fallback if OpenRouter not set)
- `OPENAI_API_KEY`: Direct OpenAI access (fallback if OpenRouter not set)
**Terminal Tool Configuration (mini-swe-agent backend):**
- `TERMINAL_ENV`: Backend type - `local`, `singularity`, `docker`, or `modal` (default: `local`)
- `TERMINAL_DOCKER_IMAGE`: Docker image to use (default: `python:3.11-slim`)
@ -278,6 +327,11 @@ All environment variables can be configured in the `.env` file (copy from `.env.
- `TERMINAL_LIFETIME_SECONDS`: Cleanup inactive environments after this time (default: `300`)
- `TERMINAL_CWD`: Working directory inside containers (default: `/tmp`)
**Browser Tool Configuration (agent-browser + Browserbase):**
- `BROWSERBASE_API_KEY`: Browserbase API key for cloud browser execution
- `BROWSERBASE_PROJECT_ID`: Browserbase project ID
- `BROWSER_SESSION_TIMEOUT`: Session timeout in seconds (default: `300`)
**Legacy Hecate Terminal Backend (optional):**
- `MORPH_API_KEY`: For Hecate/MorphCloud terminal backend
- `HECATE_VM_LIFETIME_SECONDS`: VM lifetime (default: 300)

View File

@ -49,8 +49,13 @@ _WORKER_CONFIG = {}
# All possible tools - used to ensure consistent schema across all trajectory entries
# This is required because Arrow/Parquet (used by HuggingFace datasets) needs identical schemas
ALL_POSSIBLE_TOOLS = {
'terminal', 'web_search', 'web_extract', 'web_crawl',
'vision_analyze', 'image_generate', 'mixture_of_agents'
'terminal', 'web_search', 'web_extract',
'vision_analyze', 'image_generate', 'mixture_of_agents',
# Browser automation tools
'browser_navigate', 'browser_snapshot', 'browser_click',
'browser_type', 'browser_scroll', 'browser_back',
'browser_press', 'browser_close', 'browser_get_images',
'browser_vision'
}
# Default stats for tools that weren't used
@ -828,8 +833,13 @@ class BatchRunner:
combined_file = self.output_dir / "trajectories.jsonl"
print(f"\n📦 Combining ALL batch files into {combined_file.name}...")
VALID_TOOLS = {'web_search', 'web_extract', 'web_crawl', 'terminal', 'vision_analyze',
'image_generate', 'mixture_of_agents'}
VALID_TOOLS = {'web_search', 'web_extract', 'terminal', 'vision_analyze',
'image_generate', 'mixture_of_agents',
# Browser automation tools
'browser_navigate', 'browser_snapshot', 'browser_click',
'browser_type', 'browser_scroll', 'browser_back',
'browser_press', 'browser_close', 'browser_get_images',
'browser_vision'}
total_entries = 0
filtered_entries = 0
@ -928,9 +938,9 @@ def main(
batch_size: int = None,
run_name: str = None,
distribution: str = "default",
model: str = "claude-opus-4-20250514",
model: str = "anthropic/claude-sonnet-4-20250514",
api_key: str = None,
base_url: str = "https://api.anthropic.com/v1/",
base_url: str = "https://openrouter.ai/api/v1",
max_turns: int = 10,
num_workers: int = 4,
resume: bool = False,

View File

@ -37,6 +37,22 @@ from tools.terminal_hecate import terminal_hecate_tool, check_hecate_requirement
from tools.vision_tools import vision_analyze_tool, check_vision_requirements
from tools.mixture_of_agents_tool import mixture_of_agents_tool, check_moa_requirements
from tools.image_generation_tool import image_generate_tool, check_image_generation_requirements
# Browser automation tools (agent-browser + Browserbase)
from tools.browser_tool import (
browser_navigate,
browser_snapshot,
browser_click,
browser_type,
browser_scroll,
browser_back,
browser_press,
browser_close,
browser_get_images,
browser_vision,
cleanup_browser,
check_browser_requirements,
BROWSER_TOOL_SCHEMAS
)
from toolsets import (
get_toolset, resolve_toolset, resolve_multiple_toolsets,
get_all_toolsets, get_toolset_names, validate_toolset,
@ -55,7 +71,7 @@ def get_web_tool_definitions() -> List[Dict[str, Any]]:
"type": "function",
"function": {
"name": "web_search",
"description": "Search the web for information on any topic. Returns up to 5 relevant results with titles and URLs. Uses advanced search depth for comprehensive results.",
"description": "Search the web for information on any topic. Returns up to 5 relevant results with titles and URLs. Uses advanced search depth for comprehensive results. PREFERRED over browser tools for finding information - faster and more cost-effective. Use browser tools only when you need to interact with pages (click, fill forms, handle dynamic content).",
"parameters": {
"type": "object",
"properties": {
@ -72,7 +88,7 @@ def get_web_tool_definitions() -> List[Dict[str, Any]]:
"type": "function",
"function": {
"name": "web_extract",
"description": "Extract and read the full content from specific web page URLs. Useful for getting detailed information from webpages found through search. The content returned will be excerpts and key points summarized with an LLM to reduce impact on the context window.",
"description": "Extract and read the full content from specific web page URLs. Useful for getting detailed information from webpages found through search. The content returned will be excerpts and key points summarized with an LLM to reduce impact on the context window. PREFERRED over browser tools for reading page content - faster and more cost-effective. Use browser tools only when pages require interaction or have dynamic content.",
"parameters": {
"type": "object",
"properties": {
@ -87,27 +103,6 @@ def get_web_tool_definitions() -> List[Dict[str, Any]]:
}
}
},
{
"type": "function",
"function": {
"name": "web_crawl",
"description": "Crawl a website with specific instructions to find and extract targeted content. Uses AI to intelligently navigate and extract relevant information from across the site. The content returned will be excerpts and key points summarized with an LLM to reduce impact on the context window.",
"parameters": {
"type": "object",
"properties": {
"url": {
"type": "string",
"description": "The base URL to crawl (can include or exclude https://)"
},
"instructions": {
"type": "string",
"description": "Specific instructions for what to crawl/extract using AI intelligence (e.g., 'Find pricing information', 'Get documentation pages', 'Extract contact details')"
}
},
"required": ["url"]
}
}
}
]
def get_terminal_tool_definitions() -> List[Dict[str, Any]]:
@ -244,6 +239,18 @@ def get_image_tool_definitions() -> List[Dict[str, Any]]:
]
def get_browser_tool_definitions() -> List[Dict[str, Any]]:
    """
    Build OpenAI-format tool definitions for the browser automation tools.

    Wraps each raw schema from BROWSER_TOOL_SCHEMAS (agent-browser CLI with
    Browserbase cloud execution) in the {"type": "function", ...} envelope
    expected by the OpenAI chat-completions API.

    Returns:
        List[Dict]: Browser tool definitions compatible with the OpenAI API.
    """
    definitions = []
    for schema in BROWSER_TOOL_SCHEMAS:
        definitions.append({"type": "function", "function": schema})
    return definitions
def get_all_tool_names() -> List[str]:
"""
Get the names of all available tools across all toolsets.
@ -255,7 +262,7 @@ def get_all_tool_names() -> List[str]:
# Web tools
if check_firecrawl_api_key():
tool_names.extend(["web_search", "web_extract", "web_crawl"])
tool_names.extend(["web_search", "web_extract"])
# Terminal tools (mini-swe-agent backend)
if check_terminal_requirements():
@ -273,6 +280,15 @@ def get_all_tool_names() -> List[str]:
if check_image_generation_requirements():
tool_names.extend(["image_generate"])
# Browser automation tools
if check_browser_requirements():
tool_names.extend([
"browser_navigate", "browser_snapshot", "browser_click",
"browser_type", "browser_scroll", "browser_back",
"browser_press", "browser_close", "browser_get_images",
"browser_vision"
])
return tool_names
@ -288,12 +304,22 @@ def get_toolset_for_tool(tool_name: str) -> str:
"""
toolset_mapping = {
"web_search": "web_tools",
"web_extract": "web_tools",
"web_crawl": "web_tools",
"web_extract": "web_tools",
"terminal": "terminal_tools",
"vision_analyze": "vision_tools",
"mixture_of_agents": "moa_tools",
"image_generate": "image_tools"
"image_generate": "image_tools",
# Browser automation tools
"browser_navigate": "browser_tools",
"browser_snapshot": "browser_tools",
"browser_click": "browser_tools",
"browser_type": "browser_tools",
"browser_scroll": "browser_tools",
"browser_back": "browser_tools",
"browser_press": "browser_tools",
"browser_close": "browser_tools",
"browser_get_images": "browser_tools",
"browser_vision": "browser_tools"
}
return toolset_mapping.get(tool_name, "unknown")
@ -357,6 +383,10 @@ def get_tool_definitions(
for tool in get_image_tool_definitions():
all_available_tools_map[tool["function"]["name"]] = tool
if check_browser_requirements():
for tool in get_browser_tool_definitions():
all_available_tools_map[tool["function"]["name"]] = tool
# Determine which tools to include based on toolsets
tools_to_include = set()
@ -369,14 +399,20 @@ def get_tool_definitions(
print(f"✅ Enabled toolset '{toolset_name}': {', '.join(resolved_tools) if resolved_tools else 'no tools'}")
else:
# Try legacy compatibility
if toolset_name in ["web_tools", "terminal_tools", "vision_tools", "moa_tools", "image_tools"]:
if toolset_name in ["web_tools", "terminal_tools", "vision_tools", "moa_tools", "image_tools", "browser_tools"]:
# Map legacy names to new system
legacy_map = {
"web_tools": ["web_search", "web_extract", "web_crawl"],
"web_tools": ["web_search", "web_extract"],
"terminal_tools": ["terminal"],
"vision_tools": ["vision_analyze"],
"moa_tools": ["mixture_of_agents"],
"image_tools": ["image_generate"]
"image_tools": ["image_generate"],
"browser_tools": [
"browser_navigate", "browser_snapshot", "browser_click",
"browser_type", "browser_scroll", "browser_back",
"browser_press", "browser_close", "browser_get_images",
"browser_vision"
]
}
legacy_tools = legacy_map.get(toolset_name, [])
tools_to_include.update(legacy_tools)
@ -404,13 +440,19 @@ def get_tool_definitions(
print(f"🚫 Disabled toolset '{toolset_name}': {', '.join(resolved_tools) if resolved_tools else 'no tools'}")
else:
# Try legacy compatibility
if toolset_name in ["web_tools", "terminal_tools", "vision_tools", "moa_tools", "image_tools"]:
if toolset_name in ["web_tools", "terminal_tools", "vision_tools", "moa_tools", "image_tools", "browser_tools"]:
legacy_map = {
"web_tools": ["web_search", "web_extract", "web_crawl"],
"web_tools": ["web_search", "web_extract"],
"terminal_tools": ["terminal"],
"vision_tools": ["vision_analyze"],
"moa_tools": ["mixture_of_agents"],
"image_tools": ["image_generate"]
"image_tools": ["image_generate"],
"browser_tools": [
"browser_navigate", "browser_snapshot", "browser_click",
"browser_type", "browser_scroll", "browser_back",
"browser_press", "browser_close", "browser_get_images",
"browser_vision"
]
}
legacy_tools = legacy_map.get(toolset_name, [])
tools_to_include.difference_update(legacy_tools)
@ -465,12 +507,6 @@ def handle_web_function_call(function_name: str, function_args: Dict[str, Any])
# Run async function in event loop
return asyncio.run(web_extract_tool(urls, "markdown"))
elif function_name == "web_crawl":
url = function_args.get("url", "")
instructions = function_args.get("instructions")
# Run async function in event loop
return asyncio.run(web_crawl_tool(url, instructions, "basic"))
else:
return json.dumps({"error": f"Unknown web function: {function_name}"}, ensure_ascii=False)
@ -603,7 +639,58 @@ def handle_image_function_call(function_name: str, function_args: Dict[str, Any]
return json.dumps({"error": f"Unknown image generation function: {function_name}"}, ensure_ascii=False)
def handle_function_call(function_name: str, function_args: Dict[str, Any], task_id: Optional[str] = None) -> str:
# Browser tool handlers mapping.
# Maps the tool name (as emitted by the model in a tool call) to the handler
# function imported from tools.browser_tool. NOTE: browser_snapshot is
# deliberately absent — it is special-cased in handle_browser_function_call
# because it needs the extra user_task argument for task-aware extraction.
BROWSER_HANDLERS = {
    "browser_navigate": browser_navigate,
    "browser_click": browser_click,
    "browser_type": browser_type,
    "browser_scroll": browser_scroll,
    "browser_back": browser_back,
    "browser_press": browser_press,
    "browser_close": browser_close,
    "browser_get_images": browser_get_images,
    "browser_vision": browser_vision,
}
def handle_browser_function_call(
    function_name: str,
    function_args: Dict[str, Any],
    task_id: Optional[str] = None,
    user_task: Optional[str] = None
) -> str:
    """
    Handle function calls for browser automation tools.

    Args:
        function_name (str): Name of the browser function to call
        function_args (Dict): Arguments for the function (as parsed from the
            model's tool call)
        task_id (str): Task identifier for session isolation (optional)
        user_task (str): User's current task, used for task-aware extraction
            in snapshots (optional)

    Returns:
        str: Function result as a JSON string
    """
    # Special handling for browser_snapshot which needs user_task for extraction
    if function_name == "browser_snapshot":
        full = function_args.get("full", False)
        return browser_snapshot(full=full, task_id=task_id, user_task=user_task)

    # Handle other browser tools
    if function_name in BROWSER_HANDLERS:
        handler = BROWSER_HANDLERS[function_name]
        # Drop any model-supplied task_id: passing it through **function_args
        # alongside the explicit task_id keyword below would raise
        # TypeError ("got multiple values for argument 'task_id'").
        safe_args = {k: v for k, v in function_args.items() if k != "task_id"}
        return handler(**safe_args, task_id=task_id)

    return json.dumps({"error": f"Unknown browser function: {function_name}"}, ensure_ascii=False)
def handle_function_call(
function_name: str,
function_args: Dict[str, Any],
task_id: Optional[str] = None,
user_task: Optional[str] = None
) -> str:
"""
Main function call dispatcher that routes calls to appropriate toolsets.
@ -614,7 +701,8 @@ def handle_function_call(function_name: str, function_args: Dict[str, Any], task
Args:
function_name (str): Name of the function to call
function_args (Dict): Arguments for the function
task_id (str): Unique identifier for this task to isolate VMs between concurrent tasks (optional)
task_id (str): Unique identifier for this task to isolate VMs/sessions between concurrent tasks (optional)
user_task (str): The user's original task/query (used for task-aware content extraction) (optional)
Returns:
str: Function result as JSON string
@ -624,7 +712,7 @@ def handle_function_call(function_name: str, function_args: Dict[str, Any], task
"""
try:
# Route web tools
if function_name in ["web_search", "web_extract", "web_crawl"]:
if function_name in ["web_search", "web_extract"]:
return handle_web_function_call(function_name, function_args)
# Route terminal tools
@ -643,6 +731,15 @@ def handle_function_call(function_name: str, function_args: Dict[str, Any], task
elif function_name in ["image_generate"]:
return handle_image_function_call(function_name, function_args)
# Route browser automation tools
elif function_name in [
"browser_navigate", "browser_snapshot", "browser_click",
"browser_type", "browser_scroll", "browser_back",
"browser_press", "browser_close", "browser_get_images",
"browser_vision"
]:
return handle_browser_function_call(function_name, function_args, task_id, user_task)
else:
error_msg = f"Unknown function: {function_name}"
print(f"{error_msg}")
@ -664,8 +761,8 @@ def get_available_toolsets() -> Dict[str, Dict[str, Any]]:
toolsets = {
"web_tools": {
"available": check_firecrawl_api_key(),
"tools": ["web_search_tool", "web_extract_tool", "web_crawl_tool"],
"description": "Web search, content extraction, and website crawling tools",
"tools": ["web_search_tool", "web_extract_tool"],
"description": "Web search and content extraction tools",
"requirements": ["FIRECRAWL_API_KEY environment variable"]
},
"terminal_tools": {
@ -691,6 +788,17 @@ def get_available_toolsets() -> Dict[str, Dict[str, Any]]:
"tools": ["image_generate_tool"],
"description": "Generate high-quality images from text prompts using FAL.ai's FLUX.1 Krea model with automatic 2x upscaling for enhanced quality",
"requirements": ["FAL_KEY environment variable", "fal-client package"]
},
"browser_tools": {
"available": check_browser_requirements(),
"tools": [
"browser_navigate", "browser_snapshot", "browser_click",
"browser_type", "browser_scroll", "browser_back",
"browser_press", "browser_close", "browser_get_images",
"browser_vision"
],
"description": "Browser automation for web interaction using agent-browser CLI with Browserbase cloud execution",
"requirements": ["BROWSERBASE_API_KEY", "BROWSERBASE_PROJECT_ID", "agent-browser npm package"]
}
}
@ -708,7 +816,8 @@ def check_toolset_requirements() -> Dict[str, bool]:
"terminal_tools": check_terminal_requirements(),
"vision_tools": check_vision_requirements(),
"moa_tools": check_moa_requirements(),
"image_tools": check_image_generation_requirements()
"image_tools": check_image_generation_requirements(),
"browser_tools": check_browser_requirements()
}
if __name__ == "__main__":

View File

@ -44,6 +44,7 @@ else:
# Import our tool system
from model_tools import get_tool_definitions, handle_function_call, check_toolset_requirements
from tools.terminal_tool import cleanup_vm
from tools.browser_tool import cleanup_browser
class AIAgent:
@ -58,7 +59,7 @@ class AIAgent:
self,
base_url: str = None,
api_key: str = None,
model: str = "anthropic/claude-sonnet-4-20250514",
model: str = "anthropic/claude-sonnet-4-20250514", # OpenRouter format
max_iterations: int = 10,
tool_delay: float = 1.0,
enabled_toolsets: List[str] = None,
@ -156,10 +157,7 @@ class AIAgent:
client_kwargs["api_key"] = api_key
else:
# Primary: OPENROUTER_API_KEY, fallback to direct provider keys
client_kwargs["api_key"] = os.getenv(
"OPENROUTER_API_KEY",
os.getenv("ANTHROPIC_API_KEY", os.getenv("OPENAI_API_KEY", ""))
)
client_kwargs["api_key"] = os.getenv("OPENROUTER_API_KEY", "")
try:
self.client = OpenAI(**client_kwargs)
@ -339,11 +337,12 @@ class AIAgent:
# Check if this message has tool calls
if "tool_calls" in msg and msg["tool_calls"]:
# Format assistant message with tool calls
# Add <think> tags around reasoning for trajectory storage
content = ""
# Prepend reasoning in <think> tags if available
if msg.get("reasoning") and msg["reasoning"].strip():
content = f"<think>{msg['reasoning']}</think>"
content = f"<think>\n{msg['reasoning']}\n</think>\n"
if msg.get("content") and msg["content"].strip():
content += msg["content"] + "\n"
@ -406,17 +405,18 @@ class AIAgent:
else:
# Regular assistant message without tool calls
# Add <think> tags around reasoning for trajectory storage
content = ""
# Prepend reasoning in <think> tags if available
if msg.get("reasoning") and msg["reasoning"].strip():
content = f"<think>{msg['reasoning']}</think>"
content = f"<think>\n{msg['reasoning']}\n</think>\n"
content += msg["content"] or ""
trajectory.append({
"from": "gpt",
"value": content
"value": content.strip()
})
elif msg["role"] == "user":
@ -515,7 +515,31 @@ class AIAgent:
# Prepare messages for API call
# If we have an ephemeral system prompt, prepend it to the messages
api_messages = messages.copy()
# Note: Reasoning is embedded in content via <think> tags for trajectory storage.
# However, providers like Moonshot AI require a separate 'reasoning_content' field
# on assistant messages with tool_calls. We handle both cases here.
api_messages = []
for msg in messages:
api_msg = msg.copy()
# For assistant messages with tool_calls, providers require 'reasoning_content' field
# Extract reasoning from our stored 'reasoning' field and add it as 'reasoning_content'
if msg.get("role") == "assistant" and msg.get("tool_calls"):
reasoning_text = msg.get("reasoning")
if reasoning_text:
# Add reasoning_content for API compatibility (Moonshot AI, Novita, etc.)
api_msg["reasoning_content"] = reasoning_text
# Remove 'reasoning' field - it's for trajectory storage only
# The reasoning is already in the content via <think> tags AND
# we've added reasoning_content for API compatibility above
if "reasoning" in api_msg:
api_msg.pop("reasoning")
# Remove 'reasoning_details' if present - we use reasoning_content instead
if "reasoning_details" in api_msg:
api_msg.pop("reasoning_details")
api_messages.append(api_msg)
if active_system_prompt:
# Insert system message at the beginning
api_messages = [{"role": "system", "content": active_system_prompt}] + api_messages
@ -582,7 +606,9 @@ class AIAgent:
print(f"{self.log_prefix}⏱️ API call completed in {api_duration:.2f}s")
if self.verbose_logging:
logging.debug(f"API Response received - Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}")
# Log response with provider info if available
resp_model = getattr(response, 'model', 'N/A') if response else 'N/A'
logging.debug(f"API Response received - Model: {resp_model}, Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}")
# Validate response has valid choices before proceeding
if response is None or not hasattr(response, 'choices') or response.choices is None or len(response.choices) == 0:
@ -600,12 +626,28 @@ class AIAgent:
# Check for error field in response (some providers include this)
error_msg = "Unknown"
provider_name = "Unknown"
if response and hasattr(response, 'error') and response.error:
error_msg = str(response.error)
# Try to extract provider from error metadata
if hasattr(response.error, 'metadata') and response.error.metadata:
provider_name = response.error.metadata.get('provider_name', 'Unknown')
elif response and hasattr(response, 'message') and response.message:
error_msg = str(response.message)
# Try to get provider from model field (OpenRouter often returns actual model used)
if provider_name == "Unknown" and response and hasattr(response, 'model') and response.model:
provider_name = f"model={response.model}"
# Check for x-openrouter-provider or similar metadata
if provider_name == "Unknown" and response:
# Log all response attributes for debugging
resp_attrs = {k: str(v)[:100] for k, v in vars(response).items() if not k.startswith('_')}
if self.verbose_logging:
logging.debug(f"Response attributes for invalid response: {resp_attrs}")
print(f"{self.log_prefix}⚠️ Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}")
print(f"{self.log_prefix} 🏢 Provider: {provider_name}")
print(f"{self.log_prefix} 📝 Provider message: {error_msg[:200]}")
print(f"{self.log_prefix} ⏱️ Response time: {api_duration:.2f}s (fast response often indicates rate limiting)")
@ -623,7 +665,7 @@ class AIAgent:
# Longer backoff for rate limiting (likely cause of None choices)
wait_time = min(5 * (2 ** (retry_count - 1)), 120) # 5s, 10s, 20s, 40s, 80s, 120s
print(f"{self.log_prefix}⏳ Retrying in {wait_time}s (extended backoff for possible rate limit)...")
logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)}")
logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)} | Provider: {provider_name}")
time.sleep(wait_time)
continue # Retry the API call
@ -639,12 +681,17 @@ class AIAgent:
print(f"{self.log_prefix} ⏪ Rolling back to last complete assistant turn")
rolled_back_messages = self._get_messages_up_to_last_assistant(messages)
# Clean up VM
# Clean up VM and browser
try:
cleanup_vm(effective_task_id)
except Exception as e:
if self.verbose_logging:
logging.warning(f"Failed to cleanup VM for task {effective_task_id}: {e}")
try:
cleanup_browser(effective_task_id)
except Exception as e:
if self.verbose_logging:
logging.warning(f"Failed to cleanup browser for task {effective_task_id}: {e}")
return {
"final_response": None,
@ -799,17 +846,21 @@ class AIAgent:
self._invalid_json_retries = 0
# Extract reasoning from response if available (for reasoning models like minimax, kimi, etc.)
reasoning_content = None
# Extract reasoning from response for storage
# The reasoning_content field will be added when preparing API messages
reasoning_text = None
if hasattr(assistant_message, 'reasoning') and assistant_message.reasoning:
reasoning_content = assistant_message.reasoning
reasoning_text = assistant_message.reasoning
elif hasattr(assistant_message, 'reasoning_content') and assistant_message.reasoning_content:
reasoning_content = assistant_message.reasoning_content
reasoning_text = assistant_message.reasoning_content
# Add assistant message with tool calls to conversation
messages.append({
# Build assistant message with tool calls
# Content stays as-is; reasoning is stored separately and will be passed
# to the API via reasoning_content field when preparing api_messages
assistant_msg = {
"role": "assistant",
"content": assistant_message.content,
"reasoning": reasoning_content, # Store reasoning for trajectory
"content": assistant_message.content or "",
"reasoning": reasoning_text, # Stored for trajectory extraction & API calls
"tool_calls": [
{
"id": tool_call.id,
@ -821,7 +872,9 @@ class AIAgent:
}
for tool_call in assistant_message.tool_calls
]
})
}
messages.append(assistant_msg)
# Execute each tool call
for i, tool_call in enumerate(assistant_message.tool_calls, 1):
@ -896,12 +949,17 @@ class AIAgent:
rolled_back_messages = self._get_messages_up_to_last_assistant(messages)
# Clean up VM
# Clean up VM and browser
try:
cleanup_vm(effective_task_id)
except Exception as e:
if self.verbose_logging:
logging.warning(f"Failed to cleanup VM for task {effective_task_id}: {e}")
try:
cleanup_browser(effective_task_id)
except Exception as e:
if self.verbose_logging:
logging.warning(f"Failed to cleanup browser for task {effective_task_id}: {e}")
return {
"final_response": None,
@ -917,18 +975,21 @@ class AIAgent:
self._empty_content_retries = 0
# Extract reasoning from response if available
reasoning_content = None
reasoning_text = None
if hasattr(assistant_message, 'reasoning') and assistant_message.reasoning:
reasoning_content = assistant_message.reasoning
reasoning_text = assistant_message.reasoning
elif hasattr(assistant_message, 'reasoning_content') and assistant_message.reasoning_content:
reasoning_content = assistant_message.reasoning_content
reasoning_text = assistant_message.reasoning_content
# Add final assistant message
messages.append({
# Build final assistant message
# Content stays as-is; reasoning stored separately for trajectory extraction
final_msg = {
"role": "assistant",
"content": final_response,
"reasoning": reasoning_content # Store reasoning for trajectory
})
"reasoning": reasoning_text # Stored for trajectory extraction
}
messages.append(final_msg)
print(f"🎉 Conversation completed after {api_call_count} OpenAI-compatible API call(s)")
break
@ -963,12 +1024,18 @@ class AIAgent:
# Save trajectory if enabled
self._save_trajectory(messages, user_message, completed)
# Clean up VM for this task after conversation completes
# Clean up VM and browser for this task after conversation completes
try:
cleanup_vm(effective_task_id)
except Exception as e:
if self.verbose_logging:
logging.warning(f"Failed to cleanup VM for task {effective_task_id}: {e}")
try:
cleanup_browser(effective_task_id)
except Exception as e:
if self.verbose_logging:
logging.warning(f"Failed to cleanup browser for task {effective_task_id}: {e}")
return {
"final_response": final_response,
@ -994,14 +1061,15 @@ class AIAgent:
def main(
query: str = None,
model: str = "claude-opus-4-20250514",
model: str = "anthropic/claude-sonnet-4-20250514",
api_key: str = None,
base_url: str = "https://api.anthropic.com/v1/",
base_url: str = "https://openrouter.ai/api/v1",
max_turns: int = 10,
enabled_toolsets: str = None,
disabled_toolsets: str = None,
list_tools: bool = False,
save_trajectories: bool = False,
save_sample: bool = False,
verbose: bool = False,
log_prefix_chars: int = 20
):
@ -1010,16 +1078,17 @@ def main(
Args:
query (str): Natural language query for the agent. Defaults to Python 3.13 example.
model (str): Model name to use. Defaults to claude-opus-4-20250514.
api_key (str): API key for authentication. Uses ANTHROPIC_API_KEY env var if not provided.
base_url (str): Base URL for the model API. Defaults to https://api.anthropic.com/v1/
model (str): Model name to use (OpenRouter format: provider/model). Defaults to anthropic/claude-sonnet-4-20250514.
api_key (str): API key for authentication. Uses OPENROUTER_API_KEY env var if not provided.
base_url (str): Base URL for the model API. Defaults to https://openrouter.ai/api/v1
max_turns (int): Maximum number of API call iterations. Defaults to 10.
enabled_toolsets (str): Comma-separated list of toolsets to enable. Supports predefined
toolsets (e.g., "research", "development", "safe").
Multiple toolsets can be combined: "web,vision"
disabled_toolsets (str): Comma-separated list of toolsets to disable (e.g., "terminal")
list_tools (bool): Just list available tools and exit
save_trajectories (bool): Save conversation trajectories to JSONL files. Defaults to False.
save_trajectories (bool): Save conversation trajectories to JSONL files (appends to trajectory_samples.jsonl). Defaults to False.
save_sample (bool): Save a single trajectory sample to a UUID-named JSONL file for inspection. Defaults to False.
verbose (bool): Enable verbose logging for debugging. Defaults to False.
log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses. Defaults to 20.
@ -1173,6 +1242,34 @@ def main(
print("-" * 30)
print(result['final_response'])
# Save sample trajectory to UUID-named file if requested
if save_sample:
import uuid
sample_id = str(uuid.uuid4())[:8]
sample_filename = f"sample_{sample_id}.jsonl"
# Convert messages to trajectory format (same as batch_runner)
trajectory = agent._convert_to_trajectory_format(
result['messages'],
user_query,
result['completed']
)
entry = {
"conversations": trajectory,
"timestamp": datetime.now().isoformat(),
"model": model,
"completed": result['completed'],
"query": user_query
}
try:
with open(sample_filename, "w", encoding="utf-8") as f:
f.write(json.dumps(entry, ensure_ascii=False) + "\n")
print(f"\n💾 Sample trajectory saved to: {sample_filename}")
except Exception as e:
print(f"\n⚠️ Failed to save sample: {e}")
print("\n👋 Agent execution completed!")

View File

@ -0,0 +1,299 @@
#!/usr/bin/env python3
"""
Test Modal Terminal Tool
This script tests that the Modal terminal backend is correctly configured
and can execute commands in Modal sandboxes.
Usage:
# Run with Modal backend
TERMINAL_ENV=modal python tests/test_modal_terminal.py
# Or run directly (will use whatever TERMINAL_ENV is set in .env)
python tests/test_modal_terminal.py
"""
import os
import sys
import json
from pathlib import Path
# Try to load .env file if python-dotenv is available
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    # Manually load .env if dotenv not available.
    # Only handles simple KEY=VALUE lines; no export/multiline support.
    env_file = Path(__file__).parent.parent / ".env"
    if env_file.exists():
        with open(env_file) as f:
            for line in f:
                line = line.strip()
                # Skip blanks, comments, and lines without an assignment
                if line and not line.startswith('#') and '=' in line:
                    key, value = line.split('=', 1)
                    # Remove quotes if present
                    value = value.strip().strip('"').strip("'")
                    # setdefault: real environment variables win over .env values
                    os.environ.setdefault(key.strip(), value)
# Add parent directory to path for imports (repo root + vendored mini-swe-agent)
parent_dir = Path(__file__).parent.parent
sys.path.insert(0, str(parent_dir))
sys.path.insert(0, str(parent_dir / "mini-swe-agent" / "src"))
# Import terminal_tool module directly using importlib to avoid tools/__init__.py
# (the package __init__ pulls in every tool and their heavy dependencies)
import importlib.util
terminal_tool_path = parent_dir / "tools" / "terminal_tool.py"
spec = importlib.util.spec_from_file_location("terminal_tool", terminal_tool_path)
terminal_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(terminal_module)
# Re-export the handful of names the tests below actually use
terminal_tool = terminal_module.terminal_tool
check_terminal_requirements = terminal_module.check_terminal_requirements
_get_env_config = terminal_module._get_env_config
cleanup_vm = terminal_module.cleanup_vm
get_active_environments_info = terminal_module.get_active_environments_info
def test_modal_requirements():
    """Check Modal auth/config and report whether the backend is usable."""
    print("\n" + "=" * 60)
    print("TEST 1: Modal Requirements Check")
    print("=" * 60)
    config = _get_env_config()
    print(f"Current TERMINAL_ENV: {config['env_type']}")
    print(f"Modal image: {config['modal_image']}")
    # Either a MODAL_TOKEN_ID env var or a ~/.modal.toml file can authenticate
    has_token = os.getenv("MODAL_TOKEN_ID")
    toml_path = Path.home() / ".modal.toml"
    print(f"\nModal authentication:")
    print(f" MODAL_TOKEN_ID env var: {'✅ Set' if has_token else '❌ Not set'}")
    print(f" ~/.modal.toml file: {'✅ Exists' if toml_path.exists() else '❌ Not found'}")
    # Bail out early if the backend is not even configured as modal
    if config['env_type'] != 'modal':
        print(f"\n⚠️ TERMINAL_ENV is '{config['env_type']}', not 'modal'")
        print(" Set TERMINAL_ENV=modal in .env or export it to test Modal backend")
        return False
    ok = check_terminal_requirements()
    print(f"\nRequirements check: {'✅ Passed' if ok else '❌ Failed'}")
    return ok
def test_simple_command():
    """Run a trivial echo command and verify its output and exit code."""
    print("\n" + "=" * 60)
    print("TEST 2: Simple Command Execution")
    print("=" * 60)
    task = "modal_test_simple"
    print("Executing: echo 'Hello from Modal!'")
    parsed = json.loads(terminal_tool("echo 'Hello from Modal!'", task_id=task))
    print(f"\nResult:")
    print(f" Output: {parsed.get('output', '')[:200]}")
    print(f" Exit code: {parsed.get('exit_code')}")
    print(f" Error: {parsed.get('error')}")
    # Success requires both a zero exit code and the echoed text in stdout
    ok = parsed.get('exit_code') == 0 and 'Hello from Modal!' in parsed.get('output', '')
    print(f"\nTest: {'✅ Passed' if ok else '❌ Failed'}")
    # Always release the sandbox, pass or fail
    cleanup_vm(task)
    return ok
def test_python_execution():
    """Verify that Python code can run inside the Modal sandbox."""
    print("\n" + "=" * 60)
    print("TEST 3: Python Execution")
    print("=" * 60)
    task = "modal_test_python"
    command = 'python3 -c "import sys; print(f\'Python {sys.version}\')"'
    print(f"Executing: {command}")
    parsed = json.loads(terminal_tool(command, task_id=task))
    print(f"\nResult:")
    print(f" Output: {parsed.get('output', '')[:200]}")
    print(f" Exit code: {parsed.get('exit_code')}")
    print(f" Error: {parsed.get('error')}")
    # The interpreter should exit cleanly and print its version banner
    ok = parsed.get('exit_code') == 0 and 'Python' in parsed.get('output', '')
    print(f"\nTest: {'✅ Passed' if ok else '❌ Failed'}")
    # Always release the sandbox, pass or fail
    cleanup_vm(task)
    return ok
def test_pip_install():
    """Install a small package with pip and import it inside the sandbox."""
    print("\n" + "=" * 60)
    print("TEST 4: Pip Install Test")
    print("=" * 60)
    task = "modal_test_pip"
    # cowsay is tiny and has no dependencies, keeping the test fast
    command = "pip install --break-system-packages cowsay && python3 -c \"import cowsay; cowsay.cow('Modal works!')\""
    print("Executing: pip install --break-system-packages cowsay && python3 -c \"import cowsay; cowsay.cow('Modal works!')\"")
    # Installs need more headroom than the default timeout
    parsed = json.loads(terminal_tool(command, task_id=task, timeout=120))
    print(f"\nResult:")
    output = parsed.get('output', '')
    # pip output can be long; show only the tail
    tail = output[-500:] if len(output) > 500 else output
    print(f" Output (last 500 chars): ...{tail}")
    print(f" Exit code: {parsed.get('exit_code')}")
    print(f" Error: {parsed.get('error')}")
    ok = parsed.get('exit_code') == 0 and 'Modal works!' in parsed.get('output', '')
    print(f"\nTest: {'✅ Passed' if ok else '❌ Failed'}")
    # Always release the sandbox, pass or fail
    cleanup_vm(task)
    return ok
def test_filesystem_persistence():
    """Write a file in one command and read it back in a second command."""
    print("\n" + "=" * 60)
    print("TEST 5: Filesystem Persistence")
    print("=" * 60)
    task = "modal_test_persist"
    # Step 1: write a marker file
    print("Step 1: Creating test file...")
    write_result = json.loads(
        terminal_tool("echo 'persistence test' > /tmp/modal_test.txt", task_id=task)
    )
    print(f" Exit code: {write_result.get('exit_code')}")
    # Step 2: read it back in a *separate* tool invocation — this only works
    # if both commands hit the same environment instance
    print("Step 2: Reading test file...")
    read_result = json.loads(terminal_tool("cat /tmp/modal_test.txt", task_id=task))
    print(f" Output: {read_result.get('output', '')}")
    print(f" Exit code: {read_result.get('exit_code')}")
    ok = (
        write_result.get('exit_code') == 0
        and read_result.get('exit_code') == 0
        and 'persistence test' in read_result.get('output', '')
    )
    print(f"\nTest: {'✅ Passed' if ok else '❌ Failed'}")
    # Always release the sandbox, pass or fail
    cleanup_vm(task)
    return ok
def test_environment_isolation():
    """Test that different task_ids get isolated environments.

    Writes a file under one task_id and confirms a second task_id cannot
    see its contents.

    Returns:
        bool: True when the environments are isolated.
    """
    print("\n" + "=" * 60)
    print("TEST 6: Environment Isolation")
    print("=" * 60)
    task1 = "modal_test_iso_1"
    task2 = "modal_test_iso_2"
    # Create file in task1
    print("Step 1: Creating file in task1...")
    terminal_tool("echo 'task1 data' > /tmp/isolated.txt", task_id=task1)
    # Try to read from task2 (should not exist)
    print("Step 2: Trying to read file from task2 (should not exist)...")
    result2 = terminal_tool("cat /tmp/isolated.txt 2>&1 || echo 'FILE_NOT_FOUND'", task_id=task2)
    result2_json = json.loads(result2)
    output = result2_json.get('output', '')
    # BUGFIX: the old check OR-ed in 'FILE_NOT_FOUND' / 'No such file' markers,
    # so it could report "isolated" even when task1's data leaked into task2's
    # output (any output containing both strings passed). Isolation holds
    # exactly when task1's content is absent from task2's view.
    isolated = 'task1 data' not in output
    print(f" Task2 output: {output[:200]}")
    print(f"\nTest: {'✅ Passed (environments isolated)' if isolated else '❌ Failed (environments NOT isolated)'}")
    # Cleanup both sandboxes regardless of outcome
    cleanup_vm(task1)
    cleanup_vm(task2)
    return isolated
def main():
    """Run all Modal terminal tests.

    Returns:
        bool: True only when every test passed; False on user abort, unmet
        requirements, or any failing test. Always returning a bool keeps the
        caller's ``sys.exit(0 if success else 1)`` contract explicit
        (previously the abort paths fell through with a bare ``return`` /
        ``None``).
    """
    print("🧪 Modal Terminal Tool Test Suite")
    print("=" * 60)
    # Check current config
    config = _get_env_config()
    print(f"\nCurrent configuration:")
    print(f" TERMINAL_ENV: {config['env_type']}")
    print(f" TERMINAL_MODAL_IMAGE: {config['modal_image']}")
    print(f" TERMINAL_TIMEOUT: {config['timeout']}s")
    if config['env_type'] != 'modal':
        print(f"\n⚠️ WARNING: TERMINAL_ENV is set to '{config['env_type']}', not 'modal'")
        print(" To test Modal specifically, set TERMINAL_ENV=modal")
        response = input("\n Continue testing with current backend? (y/n): ")
        if response.lower() != 'y':
            print("Aborting.")
            # BUGFIX: was a bare `return` (None); return False explicitly
            return False
    results = {}
    # Requirements gate: without it the remaining tests cannot run
    results['requirements'] = test_modal_requirements()
    if not results['requirements']:
        print("\n❌ Requirements not met. Cannot continue with other tests.")
        # BUGFIX: was a bare `return` (None); return False explicitly
        return False
    results['simple_command'] = test_simple_command()
    results['python_execution'] = test_python_execution()
    results['pip_install'] = test_pip_install()
    results['filesystem_persistence'] = test_filesystem_persistence()
    results['environment_isolation'] = test_environment_isolation()
    # Summary
    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)
    passed = sum(1 for v in results.values() if v)
    total = len(results)
    for test_name, passed_test in results.items():
        status = "✅ PASSED" if passed_test else "❌ FAILED"
        print(f" {test_name}: {status}")
    print(f"\nTotal: {passed}/{total} tests passed")
    # Report any environments that the tests failed to clean up
    env_info = get_active_environments_info()
    print(f"\nActive environments after tests: {env_info['count']}")
    if env_info['count'] > 0:
        print(f" Task IDs: {env_info['task_ids']}")
    return passed == total
if __name__ == "__main__":
    # Exit code mirrors the overall result: 0 only when main() returns a
    # truthy value (all tests passed); any falsy result exits 1.
    success = main()
    sys.exit(0 if success else 1)

View File

@ -24,11 +24,13 @@ from .web_tools import (
check_firecrawl_api_key
)
# Primary terminal tool (mini-swe-agent backend: local/docker/modal)
# Primary terminal tool (mini-swe-agent backend: local/docker/singularity/modal)
from .terminal_tool import (
terminal_tool,
check_terminal_requirements,
cleanup_vm,
cleanup_all_environments,
get_active_environments_info,
TERMINAL_TOOL_DESCRIPTION
)
@ -54,6 +56,25 @@ from .image_generation_tool import (
check_image_generation_requirements
)
# Browser automation tools (agent-browser + Browserbase)
from .browser_tool import (
browser_navigate,
browser_snapshot,
browser_click,
browser_type,
browser_scroll,
browser_back,
browser_press,
browser_close,
browser_get_images,
browser_vision,
cleanup_browser,
cleanup_all_browsers,
get_active_browser_sessions,
check_browser_requirements,
BROWSER_TOOL_SCHEMAS
)
__all__ = [
# Web tools
'web_search_tool',
@ -64,6 +85,8 @@ __all__ = [
'terminal_tool',
'check_terminal_requirements',
'cleanup_vm',
'cleanup_all_environments',
'get_active_environments_info',
'TERMINAL_TOOL_DESCRIPTION',
# Terminal tools (Hecate/MorphCloud backend)
'terminal_hecate_tool',
@ -78,5 +101,21 @@ __all__ = [
# Image generation tools
'image_generate_tool',
'check_image_generation_requirements',
# Browser automation tools
'browser_navigate',
'browser_snapshot',
'browser_click',
'browser_type',
'browser_scroll',
'browser_back',
'browser_press',
'browser_close',
'browser_get_images',
'browser_vision',
'cleanup_browser',
'cleanup_all_browsers',
'get_active_browser_sessions',
'check_browser_requirements',
'BROWSER_TOOL_SCHEMAS',
]

1454
tools/browser_tool.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -32,6 +32,10 @@ import sys
import time
import threading
import atexit
import shutil
import subprocess
import tempfile
import uuid
from pathlib import Path
from typing import Optional, Dict, Any
@ -40,6 +44,168 @@ mini_swe_path = Path(__file__).parent.parent / "mini-swe-agent" / "src"
if mini_swe_path.exists():
sys.path.insert(0, str(mini_swe_path))
# =============================================================================
# Custom Singularity Environment with more space
# =============================================================================
def _get_scratch_dir() -> Path:
"""Get the best directory for Singularity sandboxes - prefers /scratch if available."""
# Check for configurable scratch directory first (highest priority)
custom_scratch = os.getenv("TERMINAL_SCRATCH_DIR")
if custom_scratch:
scratch_path = Path(custom_scratch)
scratch_path.mkdir(parents=True, exist_ok=True)
return scratch_path
# Check for /scratch (common on HPC clusters, especially GPU nodes)
scratch = Path("/scratch")
if scratch.exists() and os.access(scratch, os.W_OK):
# Create user-specific subdirectory
user_scratch = scratch / os.getenv("USER", "hermes") / "hermes-agent"
user_scratch.mkdir(parents=True, exist_ok=True)
print(f"[Terminal] Using /scratch for sandboxes: {user_scratch}")
return user_scratch
# Fall back to /tmp
print("[Terminal] Warning: /scratch not available, using /tmp (limited space)")
return Path(tempfile.gettempdir())
# Disk usage warning threshold (in GB)
DISK_USAGE_WARNING_THRESHOLD_GB = float(os.getenv("TERMINAL_DISK_WARNING_GB", "500"))


def _check_disk_usage_warning() -> bool:
    """Warn when the combined size of hermes-* scratch dirs exceeds the threshold.

    Walks every ``hermes-*`` directory under the scratch root and sums file
    sizes. Best-effort: any error (vanished files, permission issues) is
    swallowed so disk accounting can never break command execution.

    Returns:
        bool: True if a warning was printed, False otherwise (including on error).
    """
    import glob

    scratch_dir = _get_scratch_dir()
    try:
        total_bytes = 0
        for path in glob.glob(str(scratch_dir / "hermes-*")):
            for f in Path(path).rglob('*'):
                if f.is_file():
                    try:
                        total_bytes += f.stat().st_size
                    except OSError:
                        # File disappeared or became unreadable mid-scan; skip it
                        pass
        total_gb = total_bytes / (1024 ** 3)
        if total_gb > DISK_USAGE_WARNING_THRESHOLD_GB:
            print(f"⚠️ [Terminal] WARNING: Disk usage ({total_gb:.1f}GB) exceeds threshold ({DISK_USAGE_WARNING_THRESHOLD_GB}GB)")
            print(f" Consider running cleanup_all_environments() or reducing parallel workers")
            return True
        return False
    except Exception:
        # Never let a failed size check interfere with the caller
        return False
class _SingularityEnvironment:
    """
    Custom Singularity/Apptainer environment with better space management.

    - Builds a writable sandbox in /scratch (if available) or a configurable location
    - Binds a large working directory into the container at /workspace (and /tmp)
    - Keeps the container isolated from the host filesystem (--contain --cleanenv)
    """

    def __init__(self, image: str, cwd: str = "/workspace", timeout: int = 60):
        """
        Args:
            image: Container image reference (e.g. "docker://python:3.11").
            cwd: Default working directory for commands inside the container.
            timeout: Default per-command timeout in seconds.

        Raises:
            RuntimeError: if the sandbox build fails or times out.
        """
        self.image = image
        self.cwd = cwd
        self.timeout = timeout
        # Prefer apptainer (the maintained fork) when both are installed
        self.executable = "apptainer" if shutil.which("apptainer") else "singularity"
        # Scratch area with enough space for the sandbox and working files
        self.scratch_dir = _get_scratch_dir()
        # Unique per-instance names so parallel tasks never collide
        self.sandbox_id = f"hermes-{uuid.uuid4().hex[:12]}"
        self.sandbox_dir = self.scratch_dir / self.sandbox_id
        # Working directory bound into the container (survives between commands)
        self.work_dir = self.scratch_dir / f"{self.sandbox_id}-work"
        self.work_dir.mkdir(parents=True, exist_ok=True)
        # Build the sandbox (may take minutes for large images)
        self._build_sandbox()

    def _build_sandbox(self):
        """Build a writable sandbox from the container image.

        Raises:
            RuntimeError: on build failure or timeout. Partially built
            directories are removed before raising (previously only the
            timeout path cleaned up, leaking disk space on a failed build).
        """
        try:
            result = subprocess.run(
                [self.executable, "build", "--sandbox", str(self.sandbox_dir), self.image],
                capture_output=True,
                text=True,
                timeout=300  # 5 min timeout for building
            )
            if result.returncode != 0:
                # BUGFIX: remove the partial sandbox so a failed build
                # does not leak scratch space
                shutil.rmtree(self.sandbox_dir, ignore_errors=True)
                raise RuntimeError(f"Failed to build sandbox: {result.stderr}")
            # Create /workspace inside the sandbox as the bind-mount target
            workspace_in_sandbox = self.sandbox_dir / "workspace"
            workspace_in_sandbox.mkdir(parents=True, exist_ok=True)
        except subprocess.TimeoutExpired:
            shutil.rmtree(self.sandbox_dir, ignore_errors=True)
            raise RuntimeError("Sandbox build timed out")

    def execute(self, command: str, cwd: str = "", *, timeout: int | None = None) -> dict:
        """Execute a command in the Singularity container.

        Args:
            command: Shell command, run via ``bash -c``.
            cwd: Working directory inside the container; defaults to self.cwd.
            timeout: Per-command timeout override in seconds.

        Returns:
            dict with "output" (combined stdout+stderr) and "returncode"
            (124 on timeout, mirroring GNU timeout's convention).
        """
        cmd = [self.executable, "exec"]
        # Isolation flags - contain but allow network
        cmd.extend(["--contain", "--cleanenv"])
        # Bind the working directory into the container at /workspace
        # This gives the container access to a large writable space
        cmd.extend(["--bind", f"{self.work_dir}:/workspace"])
        # Also bind it to /tmp inside container for pip cache etc.
        cmd.extend(["--bind", f"{self.work_dir}:/tmp"])
        # Set working directory
        work_dir = cwd or self.cwd
        cmd.extend(["--pwd", work_dir])
        # Use writable sandbox
        cmd.extend(["--writable", str(self.sandbox_dir)])
        # Execute the command
        cmd.extend(["bash", "-c", command])
        try:
            result = subprocess.run(
                cmd,
                text=True,
                timeout=timeout or self.timeout,
                encoding="utf-8",
                errors="replace",
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )
            return {"output": result.stdout, "returncode": result.returncode}
        except subprocess.TimeoutExpired:
            return {"output": f"Command timed out after {timeout or self.timeout}s", "returncode": 124}

    def cleanup(self):
        """Clean up sandbox and working directory (idempotent, best-effort).

        getattr guards handle partially-initialized instances: if __init__
        raised before an attribute was assigned, __del__ still calls cleanup()
        and must not blow up with AttributeError.
        """
        sandbox = getattr(self, "sandbox_dir", None)
        if sandbox is not None:
            shutil.rmtree(sandbox, ignore_errors=True)
        work = getattr(self, "work_dir", None)
        if work is not None:
            shutil.rmtree(work, ignore_errors=True)

    def stop(self):
        """Alias for cleanup()."""
        self.cleanup()

    def __del__(self):
        """Best-effort cleanup on garbage collection."""
        try:
            self.cleanup()
        except Exception:
            # Never propagate exceptions out of a finalizer
            pass
# Tool description for LLM
TERMINAL_TOOL_DESCRIPTION = """Execute commands on a secure Linux environment.
@ -71,6 +237,7 @@ TERMINAL_TOOL_DESCRIPTION = """Execute commands on a secure Linux environment.
# Global state for environment lifecycle management
_active_environments: Dict[str, Any] = {}
_task_workdirs: Dict[str, str] = {} # Maps task_id to working directory
_last_activity: Dict[str, float] = {}
_env_lock = threading.Lock()
_cleanup_thread = None
@ -80,9 +247,10 @@ _cleanup_running = False
def _get_env_config() -> Dict[str, Any]:
"""Get terminal environment configuration from environment variables."""
return {
"env_type": os.getenv("TERMINAL_ENV", "local"), # local, docker, or modal
"docker_image": os.getenv("TERMINAL_DOCKER_IMAGE", "python:3.11-slim"),
"modal_image": os.getenv("TERMINAL_MODAL_IMAGE", "python:3.11-slim"),
"env_type": os.getenv("TERMINAL_ENV", "local"), # local, docker, singularity, or modal
"docker_image": os.getenv("TERMINAL_DOCKER_IMAGE", "python:3.11"),
"singularity_image": os.getenv("TERMINAL_SINGULARITY_IMAGE", "docker://python:3.11"),
"modal_image": os.getenv("TERMINAL_MODAL_IMAGE", "python:3.11"),
"cwd": os.getenv("TERMINAL_CWD", "/tmp"),
"timeout": int(os.getenv("TERMINAL_TIMEOUT", "60")),
"lifetime_seconds": int(os.getenv("TERMINAL_LIFETIME_SECONDS", "300")),
@ -94,8 +262,8 @@ def _create_environment(env_type: str, image: str, cwd: str, timeout: int):
Create an execution environment from mini-swe-agent.
Args:
env_type: One of "local", "docker", "modal"
image: Docker/Modal image name (ignored for local)
env_type: One of "local", "docker", "singularity", "modal"
image: Docker/Singularity/Modal image name (ignored for local)
cwd: Working directory
timeout: Default command timeout
@ -110,12 +278,16 @@ def _create_environment(env_type: str, image: str, cwd: str, timeout: int):
from minisweagent.environments.docker import DockerEnvironment
return DockerEnvironment(image=image, cwd=cwd, timeout=timeout)
elif env_type == "singularity":
# Use custom Singularity environment with better space management
return _SingularityEnvironment(image=image, cwd=cwd, timeout=timeout)
elif env_type == "modal":
from minisweagent.environments.extra.swerex_modal import SwerexModalEnvironment
return SwerexModalEnvironment(image=image, cwd=cwd, timeout=timeout)
else:
raise ValueError(f"Unknown environment type: {env_type}. Use 'local', 'docker', or 'modal'")
raise ValueError(f"Unknown environment type: {env_type}. Use 'local', 'docker', 'singularity', or 'modal'")
def _cleanup_inactive_envs(lifetime_seconds: int = 300):
@ -147,6 +319,8 @@ def _cleanup_inactive_envs(lifetime_seconds: int = 300):
if task_id in _last_activity:
del _last_activity[task_id]
if task_id in _task_workdirs:
del _task_workdirs[task_id]
except Exception as e:
error_str = str(e)
@ -160,6 +334,8 @@ def _cleanup_inactive_envs(lifetime_seconds: int = 300):
del _active_environments[task_id]
if task_id in _last_activity:
del _last_activity[task_id]
if task_id in _task_workdirs:
del _task_workdirs[task_id]
def _cleanup_thread_worker():
@ -198,9 +374,63 @@ def _stop_cleanup_thread():
_cleanup_thread.join(timeout=5)
def get_active_environments_info() -> Dict[str, Any]:
    """Get information about currently active environments.

    Returns:
        dict with "count", "task_ids", "workdirs", and "total_disk_usage_mb"
        (combined size of all hermes-* directories under the scratch root).
    """
    import glob

    info = {
        "count": len(_active_environments),
        "task_ids": list(_active_environments.keys()),
        "workdirs": dict(_task_workdirs),
    }
    # BUGFIX: the old code looped over every active task and re-globbed ALL
    # "hermes-*" directories each time (the per-task pattern was built but
    # never used), so with N active tasks every directory was counted N times.
    # Scan the scratch root exactly once instead.
    total_size = 0
    scratch_dir = _get_scratch_dir()
    for path in glob.glob(str(scratch_dir / "hermes-*")):
        try:
            total_size += sum(
                f.stat().st_size for f in Path(path).rglob('*') if f.is_file()
            )
        except OSError:
            # Directory or file vanished mid-scan; skip it
            pass
    info["total_disk_usage_mb"] = round(total_size / (1024 * 1024), 2)
    return info
def cleanup_all_environments():
    """Clean up ALL active environments. Use with caution.

    Tears down every tracked environment via cleanup_vm(), then sweeps any
    orphaned ``hermes-*`` directories left under the scratch root.

    Returns:
        int: number of tracked environments successfully cleaned.
    """
    import glob

    # Snapshot the keys: cleanup_vm() mutates _active_environments as we go
    cleaned = 0
    for task_id in list(_active_environments.keys()):
        try:
            cleanup_vm(task_id)
            cleaned += 1
        except Exception as e:
            print(f"[Terminal Cleanup] Error cleaning {task_id}: {e}")
    # Also clean any orphaned directories. rmtree(ignore_errors=True) never
    # raises, so no try/except is needed here (the old bare `except: pass`
    # was dead code).
    scratch_dir = _get_scratch_dir()
    for path in glob.glob(str(scratch_dir / "hermes-*")):
        shutil.rmtree(path, ignore_errors=True)
        print(f"[Terminal Cleanup] Removed orphaned: {path}")
    print(f"[Terminal Cleanup] Cleaned {cleaned} environments")
    return cleaned
def cleanup_vm(task_id: str):
"""Manually clean up a specific environment by task_id."""
global _active_environments, _last_activity
global _active_environments, _last_activity, _task_workdirs
with _env_lock:
try:
@ -216,6 +446,9 @@ def cleanup_vm(task_id: str):
del _active_environments[task_id]
print(f"[Terminal Cleanup] Manually cleaned up environment for task: {task_id}")
if task_id in _task_workdirs:
del _task_workdirs[task_id]
if task_id in _last_activity:
del _last_activity[task_id]
@ -268,6 +501,8 @@ def terminal_tool(
# Select image based on env type
if env_type == "docker":
image = config["docker_image"]
elif env_type == "singularity":
image = config["singularity_image"]
elif env_type == "modal":
image = config["modal_image"]
else:
@ -280,12 +515,26 @@ def terminal_tool(
# Use task_id for environment isolation
effective_task_id = task_id or "default"
# For local environment, create a unique subdirectory per task
# This prevents parallel tasks from overwriting each other's files
if env_type == "local":
import uuid
with _env_lock:
if effective_task_id not in _task_workdirs:
task_workdir = Path(cwd) / f"hermes-{effective_task_id}-{uuid.uuid4().hex[:8]}"
task_workdir.mkdir(parents=True, exist_ok=True)
_task_workdirs[effective_task_id] = str(task_workdir)
cwd = _task_workdirs[effective_task_id]
# Start cleanup thread
_start_cleanup_thread()
# Get or create environment
with _env_lock:
if effective_task_id not in _active_environments:
# Check disk usage before creating new environment
_check_disk_usage_warning()
try:
_active_environments[effective_task_id] = _create_environment(
env_type=env_type,
@ -397,6 +646,16 @@ def check_terminal_requirements() -> bool:
import subprocess
result = subprocess.run(["docker", "version"], capture_output=True, timeout=5)
return result.returncode == 0
elif env_type == "singularity":
from minisweagent.environments.singularity import SingularityEnvironment
# Check if singularity/apptainer is available
import subprocess
import shutil
executable = shutil.which("apptainer") or shutil.which("singularity")
if executable:
result = subprocess.run([executable, "--version"], capture_output=True, timeout=5)
return result.returncode == 0
return False
elif env_type == "modal":
from minisweagent.environments.extra.swerex_modal import SwerexModalEnvironment
# Check for modal token

View File

@@ -155,10 +155,14 @@ async def _download_image(image_url: str, destination: Path, max_retries: int =
for attempt in range(max_retries):
try:
# Download the image with appropriate headers using async httpx
async with httpx.AsyncClient(timeout=30.0) as client:
# Enable follow_redirects to handle image CDNs that redirect (e.g., Imgur, Picsum)
async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
response = await client.get(
image_url,
headers={"User-Agent": "hermes-agent-vision/1.0"},
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "image/*,*/*;q=0.8",
},
)
response.raise_for_status()

View File

@@ -35,7 +35,8 @@ DISTRIBUTIONS = {
"vision": 100,
"image_gen": 100,
"terminal": 100,
"moa": 100
"moa": 100,
"browser": 100
}
},
@@ -55,22 +56,24 @@ DISTRIBUTIONS = {
"research": {
"description": "Web research with vision analysis and reasoning",
"toolsets": {
"web": 90, # 90% chance of web tools
"vision": 50, # 50% chance of vision tools
"moa": 40, # 40% chance of reasoning tools
"terminal": 10 # 10% chance of terminal tools
"web": 90, # 90% chance of web tools
"browser": 70, # 70% chance of browser tools for deep research
"vision": 50, # 50% chance of vision tools
"moa": 40, # 40% chance of reasoning tools
"terminal": 10 # 10% chance of terminal tools
}
},
# Scientific problem solving focused distribution
"science": {
"description": "Web research with vision analysis and reasoning",
"description": "Scientific research with web, terminal, and browser capabilities",
"toolsets": {
"web": 94, # 90% chance of web tools
"vision": 65, # 50% chance of vision tools
"moa": 10, # 40% chance of reasoning tools
"terminal": 94, # 10% chance of terminal tools
"image_gen": 15 # 80% chance of image generation tools
"web": 94, # 94% chance of web tools
"terminal": 94, # 94% chance of terminal tools
"vision": 65, # 65% chance of vision tools
"browser": 50, # 50% chance of browser for accessing papers/databases
"image_gen": 15, # 15% chance of image generation tools
"moa": 10 # 10% chance of reasoning tools
}
},
@@ -90,6 +93,7 @@ DISTRIBUTIONS = {
"description": "All tools except terminal for safety",
"toolsets": {
"web": 80,
"browser": 70, # Browser is safe (no local filesystem access)
"vision": 60,
"image_gen": 60,
"moa": 50
@@ -104,7 +108,8 @@ DISTRIBUTIONS = {
"vision": 50,
"image_gen": 50,
"terminal": 50,
"moa": 50
"moa": 50,
"browser": 50
}
},
@@ -116,6 +121,23 @@ DISTRIBUTIONS = {
}
},
# Terminal only
"terminal_only": {
"description": "Only terminal tool for code execution tasks",
"toolsets": {
"terminal": 100
}
},
# Terminal + web (common for coding tasks that need docs)
"terminal_web": {
"description": "Terminal with web search for documentation lookup",
"toolsets": {
"terminal": 100,
"web": 100
}
},
# Creative (vision + image generation)
"creative": {
"description": "Image generation and vision analysis focus",
@@ -134,6 +156,58 @@ DISTRIBUTIONS = {
"web": 30,
"terminal": 20
}
},
# Browser-based web interaction
"browser_use": {
"description": "Full browser-based web interaction with search, vision, and page control",
"toolsets": {
"browser": 100, # All browser tools always available
"web": 80, # Web search for finding URLs and quick lookups
"vision": 70 # Vision analysis for images found on pages
}
},
# Browser only (no other tools)
"browser_only": {
"description": "Only browser automation tools for pure web interaction tasks",
"toolsets": {
"browser": 100
}
},
# Browser-focused tasks distribution (for browser-use-tasks.jsonl)
"browser_tasks": {
"description": "Browser-focused distribution (browser toolset includes web_search for finding URLs since Google blocks direct browser searches)",
"toolsets": {
"browser": 97, # 97% - browser tools (includes web_search) almost always available
"vision": 12, # 12% - vision analysis occasionally
"terminal": 15 # 15% - terminal occasionally for local operations
}
},
# Terminal-focused tasks distribution (for nous-terminal-tasks.jsonl)
"terminal_tasks": {
"description": "Terminal-focused distribution with high terminal availability, occasional other tools",
"toolsets": {
"terminal": 97, # 97% - terminal almost always available
"web": 15, # 15% - web search/scrape for documentation
"browser": 10, # 10% - browser occasionally for web interaction
"vision": 8, # 8% - vision analysis rarely
"image_gen": 3 # 3% - image generation very rarely
}
},
# Mixed browser+terminal tasks distribution (for mixed-browser-terminal-tasks.jsonl)
"mixed_tasks": {
"description": "Mixed distribution with high browser and terminal availability for complex tasks",
"toolsets": {
"browser": 92, # 92% - browser tools highly available
"terminal": 92, # 92% - terminal highly available
"web": 35, # 35% - web search/scrape fairly common
"vision": 15, # 15% - vision analysis occasionally
"image_gen": 15 # 15% - image generation occasionally
}
}
}

View File

@@ -33,10 +33,16 @@ TOOLSETS = {
# Basic toolsets - individual tool categories
"web": {
"description": "Web research and content extraction tools",
"tools": ["web_search", "web_extract", "web_crawl"],
"tools": ["web_search", "web_extract"],
"includes": [] # No other toolsets included
},
"search": {
"description": "Web search only (no content extraction/scraping)",
"tools": ["web_search"],
"includes": []
},
"vision": {
"description": "Image analysis and vision tools",
"tools": ["vision_analyze"],
@@ -61,6 +67,17 @@ TOOLSETS = {
"includes": []
},
"browser": {
"description": "Browser automation for web interaction (navigate, click, type, scroll, iframes, hold-click) with web search for finding URLs",
"tools": [
"browser_navigate", "browser_snapshot", "browser_click",
"browser_type", "browser_scroll", "browser_back",
"browser_press", "browser_close", "browser_get_images",
"browser_vision", "web_search"
],
"includes": []
},
# Scenario-specific toolsets
"debugging": {