674 lines
22 KiB
Python
674 lines
22 KiB
Python
"""Firecrawl configuration widgets for advanced scraping options."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from typing import cast
|
|
|
|
from textual.app import ComposeResult
|
|
from textual.containers import Container, Horizontal
|
|
from textual.validation import Integer
|
|
from textual.widgets import Button, Checkbox, Input, Label, Switch, TextArea
|
|
from typing_extensions import override
|
|
|
|
from ..models import FirecrawlOptions
|
|
|
|
|
|
class ScrapeOptionsForm(Container):
    """Form for configuring Firecrawl scraping options.

    The form is split into three sections: output formats (checkboxes),
    content filtering (a main-content switch plus include/exclude tag
    lists) and performance settings (wait time in milliseconds).

    Use :meth:`get_scrape_options` to read the widgets into a dictionary and
    :meth:`set_scrape_options` to load a dictionary back into the widgets.
    """

    # NOTE: Textual CSS has no flexbox-style ``align-items`` property; children
    # of a row are vertically centred with ``align-vertical`` instead.
    DEFAULT_CSS = """
    ScrapeOptionsForm {
        border: solid $border;
        background: $surface;
        padding: 1;
        height: auto;
    }

    ScrapeOptionsForm .form-section {
        margin-bottom: 2;
        padding: 1;
        border: solid $border-lighten-1;
        background: $surface-lighten-1;
    }

    ScrapeOptionsForm .form-row {
        layout: horizontal;
        align-vertical: middle;
        height: auto;
        margin-bottom: 1;
    }

    ScrapeOptionsForm .form-label {
        width: 30%;
        min-width: 15;
        text-align: right;
        padding-right: 2;
    }

    ScrapeOptionsForm .form-input {
        width: 70%;
    }

    ScrapeOptionsForm .checkbox-row {
        layout: horizontal;
        align-vertical: middle;
        height: 3;
        margin-bottom: 1;
    }

    ScrapeOptionsForm .checkbox-label {
        margin-left: 2;
    }
    """

    # Output formats in display order; each has a checkbox with id "format_<name>".
    _FORMATS = ("markdown", "html", "screenshot")

    # Accepted range for the wait time, shared by the input validator and
    # get_scrape_options() so both agree on what is valid.
    _WAIT_FOR_MIN = 0
    _WAIT_FOR_MAX = 30000

    def __init__(
        self,
        *,
        name: str | None = None,
        id: str | None = None,
        classes: str | None = None,
        disabled: bool = False,
        markup: bool = True,
    ) -> None:
        """Initialize scrape options form.

        All arguments are forwarded unchanged to ``Container.__init__``.
        """
        super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)

    @override
    def compose(self) -> ComposeResult:
        """Compose scrape options form."""
        yield Label("🔧 Scraping Configuration", classes="form-title")

        # Output formats section
        yield Container(
            Label("Output Formats", classes="section-title"),
            Horizontal(
                Checkbox("Markdown", id="format_markdown", value=True, classes="checkbox"),
                Label("Markdown", classes="checkbox-label"),
                classes="checkbox-row",
            ),
            Horizontal(
                Checkbox("HTML", id="format_html", value=False, classes="checkbox"),
                Label("HTML", classes="checkbox-label"),
                classes="checkbox-row",
            ),
            Horizontal(
                Checkbox("Screenshot", id="format_screenshot", value=False, classes="checkbox"),
                Label("Screenshot", classes="checkbox-label"),
                classes="checkbox-row",
            ),
            classes="form-section",
        )

        # Content filtering section
        yield Container(
            Label("Content Filtering", classes="section-title"),
            Horizontal(
                Label("Only Main Content:", classes="form-label"),
                Switch(id="only_main_content", value=True, classes="form-input"),
                classes="form-row",
            ),
            Horizontal(
                Label("Include Tags:", classes="form-label"),
                Input(
                    placeholder="p, div, article (comma-separated)",
                    id="include_tags",
                    classes="form-input",
                ),
                classes="form-row",
            ),
            Horizontal(
                Label("Exclude Tags:", classes="form-label"),
                Input(
                    placeholder="nav, footer, script (comma-separated)",
                    id="exclude_tags",
                    classes="form-input",
                ),
                classes="form-row",
            ),
            classes="form-section",
        )

        # Performance settings section
        yield Container(
            Label("Performance Settings", classes="section-title"),
            Horizontal(
                Label("Wait Time (ms):", classes="form-label"),
                Input(
                    placeholder="0",
                    id="wait_for",
                    validators=[
                        Integer(minimum=self._WAIT_FOR_MIN, maximum=self._WAIT_FOR_MAX)
                    ],
                    classes="form-input",
                ),
                classes="form-row",
            ),
            classes="form-section",
        )

    @staticmethod
    def _parse_tags(raw: str) -> list[str]:
        """Split a comma-separated string into tags, dropping blank entries."""
        return [tag for tag in (part.strip() for part in raw.split(",")) if tag]

    @staticmethod
    def _format_tags(value: object) -> str:
        """Render a list/tuple of tags as comma-separated text ("" otherwise)."""
        if not isinstance(value, (list, tuple)):
            return ""
        return ", ".join(str(tag) for tag in value)

    @staticmethod
    def _parse_bounded_int(raw: str, minimum: int, maximum: int) -> int | None:
        """Parse ``raw`` as an int within [minimum, maximum], else return None."""
        text = raw.strip()
        if not text:
            return None
        try:
            value = int(text)
        except ValueError:
            # Best effort: unparsable text is treated as "not set".
            return None
        return value if minimum <= value <= maximum else None

    def get_scrape_options(self) -> dict[str, object]:
        """Get scraping options from form.

        Returns:
            A dictionary that always contains ``"formats"`` (list of the
            checked format names, possibly empty) and ``"only_main_content"``
            (bool). ``"include_tags"`` / ``"exclude_tags"`` are present only
            when at least one non-blank tag was entered, and ``"wait_for"``
            only when the input holds an integer inside the validator range.
        """
        formats = [
            fmt
            for fmt in self._FORMATS
            if self.query_one(f"#format_{fmt}", Checkbox).value
        ]

        options: dict[str, object] = {
            "formats": formats,
            "only_main_content": self.query_one("#only_main_content", Switch).value,
        }

        for key in ("include_tags", "exclude_tags"):
            tags = self._parse_tags(self.query_one(f"#{key}", Input).value)
            if tags:
                options[key] = tags

        # Performance
        wait_for = self._parse_bounded_int(
            self.query_one("#wait_for", Input).value,
            self._WAIT_FOR_MIN,
            self._WAIT_FOR_MAX,
        )
        if wait_for is not None:
            options["wait_for"] = wait_for

        return options

    def set_scrape_options(self, options: dict[str, object]) -> None:
        """Set form values from options.

        Every widget is updated so the form mirrors ``options``: keys that are
        missing fall back to the form defaults (markdown only, main content
        on) or to an empty input, instead of keeping text from a previous
        call.

        Args:
            options: Mapping using the same keys as :meth:`get_scrape_options`.
        """
        # Set formats
        formats = options.get("formats", ["markdown"])
        selected = formats if isinstance(formats, (list, tuple)) else []
        for fmt in self._FORMATS:
            self.query_one(f"#format_{fmt}", Checkbox).value = fmt in selected

        # Set content filtering
        self.query_one("#only_main_content", Switch).value = bool(
            options.get("only_main_content", True)
        )
        for key in ("include_tags", "exclude_tags"):
            self.query_one(f"#{key}", Input).value = self._format_tags(options.get(key))

        # Set performance
        wait_for = options.get("wait_for")
        self.query_one("#wait_for", Input).value = "" if wait_for is None else str(wait_for)
|
|
|
|
|
|
class MapOptionsForm(Container):
    """Form for configuring site mapping options.

    The form has a discovery section (search pattern, include-subdomains
    switch) and a limits section (max pages, max depth).

    Use :meth:`get_map_options` to read the widgets into a dictionary and
    :meth:`set_map_options` to load a dictionary back into the widgets.
    """

    # NOTE: Textual CSS has no flexbox-style ``align-items`` property; children
    # of a row are vertically centred with ``align-vertical`` instead.
    DEFAULT_CSS = """
    MapOptionsForm {
        border: solid $border;
        background: $surface;
        padding: 1;
        height: auto;
    }

    MapOptionsForm .form-section {
        margin-bottom: 2;
        padding: 1;
        border: solid $border-lighten-1;
        background: $surface-lighten-1;
    }

    MapOptionsForm .form-row {
        layout: horizontal;
        align-vertical: middle;
        height: auto;
        margin-bottom: 1;
    }

    MapOptionsForm .form-label {
        width: 30%;
        min-width: 15;
        text-align: right;
        padding-right: 2;
    }

    MapOptionsForm .form-input {
        width: 70%;
    }
    """

    # Accepted ranges, shared by the input validators and get_map_options()
    # so both agree on what is valid.
    _MAX_PAGES_RANGE = (1, 1000)
    _MAX_DEPTH_RANGE = (1, 20)

    def __init__(
        self,
        *,
        name: str | None = None,
        id: str | None = None,
        classes: str | None = None,
        disabled: bool = False,
        markup: bool = True,
    ) -> None:
        """Initialize map options form.

        All arguments are forwarded unchanged to ``Container.__init__``.
        """
        super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)

    @override
    def compose(self) -> ComposeResult:
        """Compose map options form."""
        pages_min, pages_max = self._MAX_PAGES_RANGE
        depth_min, depth_max = self._MAX_DEPTH_RANGE

        yield Label("🗺️ Site Mapping Configuration", classes="form-title")

        # Discovery settings section
        yield Container(
            Label("Discovery Settings", classes="section-title"),
            Horizontal(
                Label("Search Pattern:", classes="form-label"),
                Input(
                    placeholder="docs, api, guide (optional)",
                    id="search_pattern",
                    classes="form-input",
                ),
                classes="form-row",
            ),
            Horizontal(
                Label("Include Subdomains:", classes="form-label"),
                Switch(id="include_subdomains", value=False, classes="form-input"),
                classes="form-row",
            ),
            classes="form-section",
        )

        # Limits section
        yield Container(
            Label("Crawling Limits", classes="section-title"),
            Horizontal(
                Label("Max Pages:", classes="form-label"),
                Input(
                    placeholder="100",
                    id="max_pages",
                    validators=[Integer(minimum=pages_min, maximum=pages_max)],
                    classes="form-input",
                ),
                classes="form-row",
            ),
            Horizontal(
                Label("Max Depth:", classes="form-label"),
                Input(
                    placeholder="5",
                    id="max_depth",
                    validators=[Integer(minimum=depth_min, maximum=depth_max)],
                    classes="form-input",
                ),
                classes="form-row",
            ),
            classes="form-section",
        )

    @staticmethod
    def _parse_bounded_int(raw: str, minimum: int, maximum: int) -> int | None:
        """Parse ``raw`` as an int within [minimum, maximum], else return None."""
        text = raw.strip()
        if not text:
            return None
        try:
            value = int(text)
        except ValueError:
            # Best effort: unparsable text is treated as "not set".
            return None
        return value if minimum <= value <= maximum else None

    def get_map_options(self) -> dict[str, object]:
        """Get mapping options from form.

        Returns:
            A dictionary that always contains ``"include_subdomains"`` (bool).
            ``"search"`` is present only when a non-blank pattern was entered;
            ``"limit"`` and ``"max_depth"`` only when their inputs hold an
            integer inside the corresponding validator range.
        """
        options: dict[str, object] = {}

        # Discovery settings
        search_pattern = self.query_one("#search_pattern", Input).value.strip()
        if search_pattern:
            options["search"] = search_pattern

        options["include_subdomains"] = self.query_one("#include_subdomains", Switch).value

        # Limits: (option key, input selector, accepted range)
        limit_fields = (
            ("limit", "#max_pages", self._MAX_PAGES_RANGE),
            ("max_depth", "#max_depth", self._MAX_DEPTH_RANGE),
        )
        for key, selector, (minimum, maximum) in limit_fields:
            value = self._parse_bounded_int(
                self.query_one(selector, Input).value, minimum, maximum
            )
            if value is not None:
                options[key] = value

        return options

    def set_map_options(self, options: dict[str, object]) -> None:
        """Set form values from options.

        Every widget is updated so the form mirrors ``options``: missing keys
        clear the matching input (or switch subdomains off) instead of keeping
        text from a previous call.

        Args:
            options: Mapping using the same keys as :meth:`get_map_options`.
        """
        search = options.get("search")
        self.query_one("#search_pattern", Input).value = str(search) if search else ""

        self.query_one("#include_subdomains", Switch).value = bool(
            options.get("include_subdomains", False)
        )

        # Set limits
        for key, selector in (("limit", "#max_pages"), ("max_depth", "#max_depth")):
            value = options.get(key)
            self.query_one(selector, Input).value = "" if value is None else str(value)
|
|
|
|
|
|
class ExtractOptionsForm(Container):
    """Form for configuring data extraction options.

    The form holds a free-text extraction prompt, a JSON schema text area and
    a row of preset buttons that fill both fields with a ready-made
    schema/prompt pair.

    Use :meth:`get_extract_options` to read the widgets into a dictionary and
    :meth:`set_extract_options` to load a dictionary back into the widgets.
    """

    # NOTE: Textual CSS has no flexbox-style ``align-items`` property; children
    # of a row are aligned to the top with ``align-vertical`` instead.
    DEFAULT_CSS = """
    ExtractOptionsForm {
        border: solid $border;
        background: $surface;
        padding: 1;
        height: auto;
    }

    ExtractOptionsForm .form-section {
        margin-bottom: 2;
        padding: 1;
        border: solid $border-lighten-1;
        background: $surface-lighten-1;
    }

    ExtractOptionsForm .form-row {
        layout: horizontal;
        align-vertical: top;
        height: auto;
        margin-bottom: 1;
    }

    ExtractOptionsForm .form-label {
        width: 30%;
        min-width: 15;
        text-align: right;
        padding-right: 2;
        padding-top: 1;
    }

    ExtractOptionsForm .form-input {
        width: 70%;
    }

    ExtractOptionsForm .text-area {
        height: 6;
    }
    """

    # Preset button id -> (schema text, prompt text).
    _PRESETS = {
        "preset_article": (
            "{\n"
            '  "title": "string",\n'
            '  "author": "string",\n'
            '  "date": "string",\n'
            '  "content": "string",\n'
            '  "tags": ["string"]\n'
            "}",
            "Extract article title, author, publication date, main content, and associated tags",
        ),
        "preset_product": (
            "{\n"
            '  "name": "string",\n'
            '  "price": "number",\n'
            '  "description": "string",\n'
            '  "category": "string",\n'
            '  "availability": "string"\n'
            "}",
            "Extract product name, price, description, category, and availability status",
        ),
        "preset_contact": (
            "{\n"
            '  "name": "string",\n'
            '  "email": "string",\n'
            '  "phone": "string",\n'
            '  "company": "string",\n'
            '  "position": "string"\n'
            "}",
            "Extract contact information including name, email, phone, company, and position",
        ),
        "preset_data": (
            "{\n"
            '  "metrics": [{"name": "string", "value": "number", "unit": "string"}],\n'
            '  "tables": [{"headers": ["string"], "rows": [["string"]]}]\n'
            "}",
            "Extract numerical data, metrics, and tabular information",
        ),
    }

    def __init__(
        self,
        *,
        name: str | None = None,
        id: str | None = None,
        classes: str | None = None,
        disabled: bool = False,
        markup: bool = True,
    ) -> None:
        """Initialize extract options form.

        All arguments are forwarded unchanged to ``Container.__init__``.
        """
        super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)

    @override
    def compose(self) -> ComposeResult:
        """Compose extract options form."""
        yield Label("🎯 Data Extraction Configuration", classes="form-title")

        # Extraction prompt section
        yield Container(
            Label("AI-Powered Extraction", classes="section-title"),
            Horizontal(
                Label("Custom Prompt:", classes="form-label"),
                TextArea(
                    placeholder="Extract product names, prices, and descriptions...",
                    id="extract_prompt",
                    classes="form-input text-area",
                ),
                classes="form-row",
            ),
            classes="form-section",
        )

        # Schema definition section
        yield Container(
            Label("Structured Schema (JSON)", classes="section-title"),
            Horizontal(
                Label("Schema Definition:", classes="form-label"),
                TextArea(
                    placeholder='{"product_name": "string", "price": "number", "description": "string"}',
                    id="extract_schema",
                    classes="form-input text-area",
                ),
                classes="form-row",
            ),
            Container(
                Label("💡 Tip: Define the structure of data you want to extract"),
                classes="help-text",
            ),
            classes="form-section",
        )

        # Schema presets
        yield Container(
            Label("Quick Presets", classes="section-title"),
            Horizontal(
                Button("📄 Article", id="preset_article", variant="default"),
                Button("🛍️ Product", id="preset_product", variant="default"),
                Button("👤 Contact", id="preset_contact", variant="default"),
                Button("📊 Data", id="preset_data", variant="default"),
                classes="preset-buttons",
            ),
            classes="form-section",
        )

    def get_extract_options(self) -> dict[str, object]:
        """Get extraction options from form.

        Returns:
            A dictionary with ``"extract_prompt"`` when the prompt is not
            blank and ``"extract_schema"`` (the parsed JSON value) when the
            schema text area contains valid JSON. Invalid JSON is skipped
            silently, so the key is simply absent in that case.
        """
        options: dict[str, object] = {}

        # Extract prompt
        prompt = self.query_one("#extract_prompt", TextArea).text.strip()
        if prompt:
            options["extract_prompt"] = prompt

        # Extract schema
        schema_text = self.query_one("#extract_schema", TextArea).text
        if schema_text.strip():
            try:
                options["extract_schema"] = json.loads(schema_text)
            except json.JSONDecodeError:
                # Best effort: invalid JSON means "no schema" rather than an error.
                pass

        return options

    def set_extract_options(self, options: dict[str, object]) -> None:
        """Set form values from options.

        Both text areas are updated so the form mirrors ``options``: a missing
        or empty value clears the field instead of keeping text from a
        previous call.

        Args:
            options: Mapping using the same keys as
                :meth:`get_extract_options`. The schema is rendered with
                ``json.dumps(..., indent=2)``.
        """
        prompt = options.get("extract_prompt")
        self.query_one("#extract_prompt", TextArea).text = str(prompt) if prompt else ""

        schema = options.get("extract_schema")
        self.query_one("#extract_schema", TextArea).text = (
            json.dumps(schema, indent=2) if schema else ""
        )

    def on_button_pressed(self, event: Button.Pressed) -> None:
        """Handle preset button presses.

        Fills the schema and prompt text areas with the preset that matches
        the pressed button's id; presses of any other button are ignored.
        """
        preset = self._PRESETS.get(event.button.id or "")
        if preset is None:
            return

        schema_text, prompt_text = preset
        self.query_one("#extract_schema", TextArea).text = schema_text
        self.query_one("#extract_prompt", TextArea).text = prompt_text
|
|
|
|
|
|
class FirecrawlConfigWidget(Container):
    """Complete Firecrawl configuration widget with tabbed interface.

    Three tab buttons switch the content area between the scrape, map and
    extract forms. Only one form is mounted at a time; switching tabs
    replaces the mounted form, so values entered on a tab are not kept once
    another tab is shown.
    """

    # NOTE: Textual's ``align`` takes two values (horizontal then vertical).
    DEFAULT_CSS = """
    FirecrawlConfigWidget {
        border: solid $border;
        background: $surface;
        height: 100%;
        padding: 1;
    }

    FirecrawlConfigWidget .config-header {
        dock: top;
        height: 3;
        background: $primary;
        color: $text;
        padding: 1;
        margin: -1 -1 1 -1;
    }

    FirecrawlConfigWidget .tab-buttons {
        dock: top;
        height: 3;
        layout: horizontal;
        margin-bottom: 1;
    }

    FirecrawlConfigWidget .tab-button {
        width: 1fr;
        margin-right: 1;
    }

    FirecrawlConfigWidget .tab-content {
        height: 1fr;
        overflow: auto;
    }

    FirecrawlConfigWidget .actions {
        dock: bottom;
        height: 3;
        layout: horizontal;
        align: center middle;
        margin-top: 1;
    }
    """

    # Known tab names; the button id is "tab_<name>" and the form id "<name>_form".
    _TABS = ("scrape", "map", "extract")
    _TAB_BUTTON_PREFIX = "tab_"

    def __init__(
        self,
        *,
        name: str | None = None,
        id: str | None = None,
        classes: str | None = None,
        disabled: bool = False,
        markup: bool = True,
    ) -> None:
        """Initialize Firecrawl config widget.

        All arguments are forwarded unchanged to ``Container.__init__``; the
        initially selected tab is ``"scrape"``.
        """
        super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)
        self.current_tab = "scrape"

    @override
    def compose(self) -> ComposeResult:
        """Compose config widget layout."""
        yield Container(
            Label("🔥 Firecrawl Configuration", classes="config-header"),
            Horizontal(
                Button("🔧 Scraping", id="tab_scrape", variant="primary", classes="tab-button"),
                Button("🗺️ Mapping", id="tab_map", variant="default", classes="tab-button"),
                Button("🎯 Extraction", id="tab_extract", variant="default", classes="tab-button"),
                classes="tab-buttons",
            ),
            Container(
                ScrapeOptionsForm(id="scrape_form"),
                classes="tab-content",
            ),
            Horizontal(
                Button("📋 Load Preset", id="load_preset", variant="default"),
                Button("💾 Save Preset", id="save_preset", variant="default"),
                Button("🔄 Reset", id="reset_config", variant="default"),
                classes="actions",
            ),
        )

    def on_mount(self) -> None:
        """Initialize widget."""
        self.show_tab("scrape")

    def _build_form(self, tab_name: str) -> Container:
        """Create a fresh, unmounted form widget for a known tab name."""
        if tab_name == "extract":
            return ExtractOptionsForm(id="extract_form")
        if tab_name == "map":
            return MapOptionsForm(id="map_form")
        return ScrapeOptionsForm(id="scrape_form")

    def show_tab(self, tab_name: str) -> None:
        """Show specific configuration tab.

        Highlights the matching tab button and makes sure the matching form is
        the one mounted in the content area. Unknown tab names are ignored so
        the widget never ends up with an empty content area.

        Args:
            tab_name: One of ``"scrape"``, ``"map"`` or ``"extract"``.
        """
        if tab_name not in self._TABS:
            return

        self.current_tab = tab_name

        # Update button states
        for tab in self._TABS:
            button = self.query_one(f"#{self._TAB_BUTTON_PREFIX}{tab}", Button)
            button.variant = "primary" if tab == tab_name else "default"

        # Update tab content
        content_container = self.query_one(".tab-content", Container)
        form_id = f"{tab_name}_form"

        # remove_children() is not awaited here, so the old children are still
        # registered when mount() runs. Mounting a new form with the same id
        # would therefore collide with the one being removed; keep the form
        # that is already showing instead (this is the on_mount() case, where
        # compose() has already mounted the scrape form).
        if any(child.id == form_id for child in content_container.children):
            return

        content_container.remove_children()
        content_container.mount(self._build_form(tab_name))

    def on_button_pressed(self, event: Button.Pressed) -> None:
        """Handle button presses.

        Only the tab buttons (ids starting with ``"tab_"``) are handled here;
        any other button press is left untouched.
        """
        button_id = event.button.id
        if button_id and button_id.startswith(self._TAB_BUTTON_PREFIX):
            self.show_tab(button_id[len(self._TAB_BUTTON_PREFIX):])

    def get_all_options(self) -> FirecrawlOptions:
        """Get all configuration options.

        Only the form of the currently selected tab is mounted, so the result
        contains that form's options only.

        Returns:
            The current tab's options, or an empty mapping when the form (or
            one of its inputs) is not available yet, e.g. right after a tab
            switch while the new form is still being composed.
        """
        # Local import keeps this block self-contained; textual is already a
        # dependency of this module.
        from textual.css.query import QueryError

        options: FirecrawlOptions = {}

        try:
            if self.current_tab == "scrape":
                collected = self.query_one(
                    "#scrape_form", ScrapeOptionsForm
                ).get_scrape_options()
            elif self.current_tab == "map":
                collected = self.query_one("#map_form", MapOptionsForm).get_map_options()
            elif self.current_tab == "extract":
                collected = self.query_one(
                    "#extract_form", ExtractOptionsForm
                ).get_extract_options()
            else:
                return options
        except QueryError:
            # Best effort: a form or input that is not mounted yields no options.
            return options

        options.update(cast(FirecrawlOptions, collected))
        return options
|