Files
rag-manager/ingest_pipeline/cli/tui/widgets/firecrawl_config.py
2025-09-19 06:56:19 +00:00

674 lines
22 KiB
Python

"""Firecrawl configuration widgets for advanced scraping options."""
from __future__ import annotations
import json
from typing import cast
from textual.app import ComposeResult
from textual.containers import Container, Horizontal
from textual.validation import Integer
from textual.widgets import Button, Checkbox, Input, Label, Switch, TextArea
from typing_extensions import override
from ..models import FirecrawlOptions
class ScrapeOptionsForm(Container):
    """Form for configuring Firecrawl scraping options.

    The form exposes three groups of settings: output formats (checkboxes),
    content filtering (a switch plus two comma-separated tag inputs) and a
    wait time in milliseconds. ``get_scrape_options`` reads the widgets into
    a plain dict and ``set_scrape_options`` writes such a dict back.
    """

    # Textual CSS has no ``align-items`` property and the theme defines
    # ``$border`` without lighten/darken shades, so rows are centred with
    # ``align-vertical`` and section borders use plain ``$border``.
    DEFAULT_CSS = """
ScrapeOptionsForm {
border: solid $border;
background: $surface;
padding: 1;
height: auto;
}
ScrapeOptionsForm .form-section {
margin-bottom: 2;
padding: 1;
border: solid $border;
background: $surface-lighten-1;
}
ScrapeOptionsForm .form-row {
layout: horizontal;
align-vertical: middle;
height: auto;
margin-bottom: 1;
}
ScrapeOptionsForm .form-label {
width: 30%;
min-width: 15;
text-align: right;
padding-right: 2;
}
ScrapeOptionsForm .form-input {
width: 70%;
}
ScrapeOptionsForm .checkbox-row {
layout: horizontal;
align-vertical: middle;
height: 3;
margin-bottom: 1;
}
"""

    # (option value, checkbox label, initially checked). The checkbox id is
    # always "format_<option value>".
    _FORMATS = (
        ("markdown", "Markdown", True),
        ("html", "HTML", False),
        ("screenshot", "Screenshot", False),
    )
    # Inputs holding comma-separated tag lists; widget id == option key.
    _TAG_FIELDS = ("include_tags", "exclude_tags")
    # Accepted wait time range, shared by the Input validator and the parser
    # so the two can never drift apart.
    _WAIT_FOR_MIN = 0
    _WAIT_FOR_MAX = 30000

    def __init__(
        self,
        *,
        name: str | None = None,
        id: str | None = None,
        classes: str | None = None,
        disabled: bool = False,
        markup: bool = True,
    ) -> None:
        """Initialize scrape options form.

        All arguments are forwarded unchanged to ``Container``.
        """
        super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)

    @staticmethod
    def _parse_tags(text: str) -> list[str]:
        """Split a comma-separated string into trimmed, non-empty tags."""
        return [tag for tag in (part.strip() for part in text.split(",")) if tag]

    @staticmethod
    def _parse_int(text: str, minimum: int, maximum: int) -> int | None:
        """Return ``text`` as an int within ``[minimum, maximum]``, else ``None``.

        Input validators only flag bad values in the UI; they do not stop the
        user from leaving them in the field, so the range is enforced here too.
        """
        try:
            value = int(text.strip())
        except ValueError:
            return None
        return value if minimum <= value <= maximum else None

    @override
    def compose(self) -> ComposeResult:
        """Compose scrape options form."""
        yield Label("🔧 Scraping Configuration", classes="form-title")
        # Output formats section. Checkbox renders its own label, so no
        # separate Label widget is placed next to it.
        yield Container(
            Label("Output Formats", classes="section-title"),
            *(
                Horizontal(
                    Checkbox(label, id=f"format_{key}", value=checked, classes="checkbox"),
                    classes="checkbox-row",
                )
                for key, label, checked in self._FORMATS
            ),
            classes="form-section",
        )
        # Content filtering section
        yield Container(
            Label("Content Filtering", classes="section-title"),
            Horizontal(
                Label("Only Main Content:", classes="form-label"),
                Switch(id="only_main_content", value=True, classes="form-input"),
                classes="form-row",
            ),
            Horizontal(
                Label("Include Tags:", classes="form-label"),
                Input(
                    placeholder="p, div, article (comma-separated)",
                    id="include_tags",
                    classes="form-input",
                ),
                classes="form-row",
            ),
            Horizontal(
                Label("Exclude Tags:", classes="form-label"),
                Input(
                    placeholder="nav, footer, script (comma-separated)",
                    id="exclude_tags",
                    classes="form-input",
                ),
                classes="form-row",
            ),
            classes="form-section",
        )
        # Performance settings section
        yield Container(
            Label("Performance Settings", classes="section-title"),
            Horizontal(
                Label("Wait Time (ms):", classes="form-label"),
                Input(
                    placeholder="0",
                    id="wait_for",
                    validators=[
                        Integer(minimum=self._WAIT_FOR_MIN, maximum=self._WAIT_FOR_MAX)
                    ],
                    classes="form-input",
                ),
                classes="form-row",
            ),
            classes="form-section",
        )

    def get_scrape_options(self) -> dict[str, object]:
        """Get scraping options from form.

        Returns:
            A dict that always contains ``formats`` (list of selected format
            names) and ``only_main_content`` (bool). ``include_tags`` and
            ``exclude_tags`` are present only when at least one non-empty tag
            was entered. ``wait_for`` is present only when the field holds an
            integer inside the validated range.
        """
        formats = [
            key
            for key, _label, _checked in self._FORMATS
            if self.query_one(f"#format_{key}", Checkbox).value
        ]
        options: dict[str, object] = {
            "formats": formats,
            "only_main_content": self.query_one("#only_main_content", Switch).value,
        }
        for field in self._TAG_FIELDS:
            tags = self._parse_tags(self.query_one(f"#{field}", Input).value)
            if tags:
                options[field] = tags
        # Performance: unparsable or out-of-range input is omitted rather
        # than forwarded to the scraper.
        wait_for = self._parse_int(
            self.query_one("#wait_for", Input).value,
            self._WAIT_FOR_MIN,
            self._WAIT_FOR_MAX,
        )
        if wait_for is not None:
            options["wait_for"] = wait_for
        return options

    def set_scrape_options(self, options: dict[str, object]) -> None:
        """Set form values from options.

        Every field is overwritten. Keys missing from ``options`` reset the
        corresponding widget to its default (markdown only, main content on,
        empty text inputs) so values from a previous configuration never
        linger in the form.
        """
        formats = options.get("formats", ["markdown"])
        selected = formats if isinstance(formats, list) else []
        for key, _label, _checked in self._FORMATS:
            self.query_one(f"#format_{key}", Checkbox).value = key in selected
        # Content filtering
        self.query_one("#only_main_content", Switch).value = bool(
            options.get("only_main_content", True)
        )
        for field in self._TAG_FIELDS:
            tags = options.get(field)
            text = ", ".join(str(tag) for tag in tags) if isinstance(tags, (list, tuple)) else ""
            self.query_one(f"#{field}", Input).value = text
        # Performance
        wait_for = options.get("wait_for")
        self.query_one("#wait_for", Input).value = "" if wait_for is None else str(wait_for)
class MapOptionsForm(Container):
    """Form for configuring site mapping options.

    Holds a search pattern, an include-subdomains switch and two numeric
    limits. ``get_map_options`` reads the widgets into a plain dict and
    ``set_map_options`` writes such a dict back.
    """

    # Textual CSS has no ``align-items`` property and the theme defines
    # ``$border`` without lighten/darken shades, so rows are centred with
    # ``align-vertical`` and section borders use plain ``$border``.
    DEFAULT_CSS = """
MapOptionsForm {
border: solid $border;
background: $surface;
padding: 1;
height: auto;
}
MapOptionsForm .form-section {
margin-bottom: 2;
padding: 1;
border: solid $border;
background: $surface-lighten-1;
}
MapOptionsForm .form-row {
layout: horizontal;
align-vertical: middle;
height: auto;
margin-bottom: 1;
}
MapOptionsForm .form-label {
width: 30%;
min-width: 15;
text-align: right;
padding-right: 2;
}
MapOptionsForm .form-input {
width: 70%;
}
"""

    # (minimum, maximum) accepted for each numeric field; shared by the Input
    # validators and the parser so the two can never drift apart.
    _MAX_PAGES_RANGE = (1, 1000)
    _MAX_DEPTH_RANGE = (1, 20)

    def __init__(
        self,
        *,
        name: str | None = None,
        id: str | None = None,
        classes: str | None = None,
        disabled: bool = False,
        markup: bool = True,
    ) -> None:
        """Initialize map options form.

        All arguments are forwarded unchanged to ``Container``.
        """
        super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)

    @staticmethod
    def _parse_int(text: str, minimum: int, maximum: int) -> int | None:
        """Return ``text`` as an int within ``[minimum, maximum]``, else ``None``.

        Input validators only flag bad values in the UI; they do not stop the
        user from leaving them in the field, so the range is enforced here too.
        """
        try:
            value = int(text.strip())
        except ValueError:
            return None
        return value if minimum <= value <= maximum else None

    @override
    def compose(self) -> ComposeResult:
        """Compose map options form."""
        pages_min, pages_max = self._MAX_PAGES_RANGE
        depth_min, depth_max = self._MAX_DEPTH_RANGE
        yield Label("🗺️ Site Mapping Configuration", classes="form-title")
        # Discovery settings section
        yield Container(
            Label("Discovery Settings", classes="section-title"),
            Horizontal(
                Label("Search Pattern:", classes="form-label"),
                Input(
                    placeholder="docs, api, guide (optional)",
                    id="search_pattern",
                    classes="form-input",
                ),
                classes="form-row",
            ),
            Horizontal(
                Label("Include Subdomains:", classes="form-label"),
                Switch(id="include_subdomains", value=False, classes="form-input"),
                classes="form-row",
            ),
            classes="form-section",
        )
        # Limits section
        yield Container(
            Label("Crawling Limits", classes="section-title"),
            Horizontal(
                Label("Max Pages:", classes="form-label"),
                Input(
                    placeholder="100",
                    id="max_pages",
                    validators=[Integer(minimum=pages_min, maximum=pages_max)],
                    classes="form-input",
                ),
                classes="form-row",
            ),
            Horizontal(
                Label("Max Depth:", classes="form-label"),
                Input(
                    placeholder="5",
                    id="max_depth",
                    validators=[Integer(minimum=depth_min, maximum=depth_max)],
                    classes="form-input",
                ),
                classes="form-row",
            ),
            classes="form-section",
        )

    def get_map_options(self) -> dict[str, object]:
        """Get mapping options from form.

        Returns:
            A dict that always contains ``include_subdomains`` (bool).
            ``search`` is present only when a non-blank pattern was entered.
            ``limit`` and ``max_depth`` are present only when their field
            holds an integer inside the validated range.
        """
        options: dict[str, object] = {}
        # Discovery settings
        search_pattern = self.query_one("#search_pattern", Input).value.strip()
        if search_pattern:
            options["search"] = search_pattern
        options["include_subdomains"] = self.query_one("#include_subdomains", Switch).value
        # Limits: unparsable or out-of-range input is omitted rather than
        # forwarded to the crawler.
        numeric_fields = (
            ("limit", "#max_pages", self._MAX_PAGES_RANGE),
            ("max_depth", "#max_depth", self._MAX_DEPTH_RANGE),
        )
        for key, selector, (minimum, maximum) in numeric_fields:
            value = self._parse_int(self.query_one(selector, Input).value, minimum, maximum)
            if value is not None:
                options[key] = value
        return options

    def set_map_options(self, options: dict[str, object]) -> None:
        """Set form values from options.

        Every field is overwritten. Keys missing from ``options`` reset the
        corresponding widget (empty text inputs, subdomains off) so values
        from a previous configuration never linger in the form.
        """
        search = options.get("search")
        self.query_one("#search_pattern", Input).value = str(search) if search else ""
        self.query_one("#include_subdomains", Switch).value = bool(
            options.get("include_subdomains", False)
        )
        # Limits
        for key, selector in (("limit", "#max_pages"), ("max_depth", "#max_depth")):
            value = options.get(key)
            self.query_one(selector, Input).value = "" if value is None else str(value)
class ExtractOptionsForm(Container):
    """Form for configuring data extraction options.

    Holds a free-text extraction prompt, a JSON schema text area and a row of
    preset buttons that fill both fields. ``get_extract_options`` reads the
    widgets into a plain dict and ``set_extract_options`` writes such a dict
    back.
    """

    # Textual CSS has no ``align-items`` property and the theme defines
    # ``$border`` without lighten/darken shades, so rows are top-aligned with
    # ``align-vertical`` and section borders use plain ``$border``.
    DEFAULT_CSS = """
ExtractOptionsForm {
border: solid $border;
background: $surface;
padding: 1;
height: auto;
}
ExtractOptionsForm .form-section {
margin-bottom: 2;
padding: 1;
border: solid $border;
background: $surface-lighten-1;
}
ExtractOptionsForm .form-row {
layout: horizontal;
align-vertical: top;
height: auto;
margin-bottom: 1;
}
ExtractOptionsForm .form-label {
width: 30%;
min-width: 15;
text-align: right;
padding-right: 2;
padding-top: 1;
}
ExtractOptionsForm .form-input {
width: 70%;
}
ExtractOptionsForm .text-area {
height: 6;
}
"""

    # Preset button id -> (schema text, prompt text) written into the form.
    _PRESETS = {
        "preset_article": (
            """{
"title": "string",
"author": "string",
"date": "string",
"content": "string",
"tags": ["string"]
}""",
            "Extract article title, author, publication date, main content, and associated tags",
        ),
        "preset_product": (
            """{
"name": "string",
"price": "number",
"description": "string",
"category": "string",
"availability": "string"
}""",
            "Extract product name, price, description, category, and availability status",
        ),
        "preset_contact": (
            """{
"name": "string",
"email": "string",
"phone": "string",
"company": "string",
"position": "string"
}""",
            "Extract contact information including name, email, phone, company, and position",
        ),
        "preset_data": (
            """{
"metrics": [{"name": "string", "value": "number", "unit": "string"}],
"tables": [{"headers": ["string"], "rows": [["string"]]}]
}""",
            "Extract numerical data, metrics, and tabular information",
        ),
    }

    def __init__(
        self,
        *,
        name: str | None = None,
        id: str | None = None,
        classes: str | None = None,
        disabled: bool = False,
        markup: bool = True,
    ) -> None:
        """Initialize extract options form.

        All arguments are forwarded unchanged to ``Container``.
        """
        super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)

    @override
    def compose(self) -> ComposeResult:
        """Compose extract options form."""
        yield Label("🎯 Data Extraction Configuration", classes="form-title")
        # Extraction prompt section
        yield Container(
            Label("AI-Powered Extraction", classes="section-title"),
            Horizontal(
                Label("Custom Prompt:", classes="form-label"),
                TextArea(
                    placeholder="Extract product names, prices, and descriptions...",
                    id="extract_prompt",
                    classes="form-input text-area",
                ),
                classes="form-row",
            ),
            classes="form-section",
        )
        # Schema definition section
        yield Container(
            Label("Structured Schema (JSON)", classes="section-title"),
            Horizontal(
                Label("Schema Definition:", classes="form-label"),
                TextArea(
                    placeholder='{"product_name": "string", "price": "number", "description": "string"}',
                    id="extract_schema",
                    classes="form-input text-area",
                ),
                classes="form-row",
            ),
            Container(
                Label("💡 Tip: Define the structure of data you want to extract"),
                classes="help-text",
            ),
            classes="form-section",
        )
        # Schema presets; the ids must match the keys of ``_PRESETS``.
        yield Container(
            Label("Quick Presets", classes="section-title"),
            Horizontal(
                Button("📄 Article", id="preset_article", variant="default"),
                Button("🛍️ Product", id="preset_product", variant="default"),
                Button("👤 Contact", id="preset_contact", variant="default"),
                Button("📊 Data", id="preset_data", variant="default"),
                classes="preset-buttons",
            ),
            classes="form-section",
        )

    def get_extract_options(self) -> dict[str, object]:
        """Get extraction options from form.

        Returns:
            A dict with ``extract_prompt`` when the prompt is non-blank and
            ``extract_schema`` (the parsed JSON value) when the schema field
            holds valid JSON. A schema that fails to parse is left out and a
            warning notification is shown so the omission is not silent.
        """
        options: dict[str, object] = {}
        # Extract prompt
        prompt = self.query_one("#extract_prompt", TextArea).text.strip()
        if prompt:
            options["extract_prompt"] = prompt
        # Extract schema
        schema_text = self.query_one("#extract_schema", TextArea).text.strip()
        if schema_text:
            try:
                options["extract_schema"] = json.loads(schema_text)
            except json.JSONDecodeError as exc:
                self.notify(
                    "Extraction schema is not valid JSON "
                    f"(line {exc.lineno}, column {exc.colno}); it was ignored.",
                    title="Invalid schema",
                    severity="warning",
                )
        return options

    def set_extract_options(self, options: dict[str, object]) -> None:
        """Set form values from options.

        Both fields are overwritten. A missing or empty ``extract_prompt`` /
        ``extract_schema`` clears the corresponding text area so text from a
        previous configuration never lingers in the form. The schema is
        rendered as JSON with two-space indentation.
        """
        prompt = options.get("extract_prompt")
        self.query_one("#extract_prompt", TextArea).text = str(prompt) if prompt else ""
        schema = options.get("extract_schema")
        self.query_one("#extract_schema", TextArea).text = (
            json.dumps(schema, indent=2) if schema else ""
        )

    def on_button_pressed(self, event: Button.Pressed) -> None:
        """Handle preset button presses.

        A press on one of the preset buttons replaces the schema and prompt
        text with that preset. Presses from any other button are ignored and
        left to bubble to ancestors.
        """
        preset = self._PRESETS.get(event.button.id or "")
        if preset is None:
            return
        schema_text, prompt_text = preset
        self.query_one("#extract_schema", TextArea).text = schema_text
        self.query_one("#extract_prompt", TextArea).text = prompt_text
        # The press has been fully handled here; keep it from reaching
        # ancestor button handlers.
        event.stop()
class FirecrawlConfigWidget(Container):
    """Complete Firecrawl configuration widget with tabbed interface.

    The scrape, map and extract forms are all mounted once and stay mounted;
    switching tabs only toggles which form is displayed. This keeps whatever
    the user typed in a tab when they move to another one and lets
    ``get_all_options`` read every form, not just the visible one.
    """

    # ``align`` takes a horizontal and a vertical value in Textual CSS, so
    # the action row uses "center middle".
    DEFAULT_CSS = """
FirecrawlConfigWidget {
border: solid $border;
background: $surface;
height: 100%;
padding: 1;
}
FirecrawlConfigWidget .config-header {
dock: top;
height: 3;
background: $primary;
color: $text;
padding: 1;
margin: -1 -1 1 -1;
}
FirecrawlConfigWidget .tab-buttons {
dock: top;
height: 3;
layout: horizontal;
margin-bottom: 1;
}
FirecrawlConfigWidget .tab-button {
width: 1fr;
margin-right: 1;
}
FirecrawlConfigWidget .tab-content {
height: 1fr;
overflow: auto;
}
FirecrawlConfigWidget .actions {
dock: bottom;
height: 3;
layout: horizontal;
align: center middle;
margin-top: 1;
}
"""

    # Known tab names. Tab "<name>" is driven by button "#tab_<name>" and
    # shows form "#<name>_form".
    _TABS = ("scrape", "map", "extract")

    def __init__(
        self,
        *,
        name: str | None = None,
        id: str | None = None,
        classes: str | None = None,
        disabled: bool = False,
        markup: bool = True,
    ) -> None:
        """Initialize Firecrawl config widget.

        All arguments are forwarded unchanged to ``Container``. The scrape
        tab is the initially selected tab.
        """
        super().__init__(name=name, id=id, classes=classes, disabled=disabled, markup=markup)
        self.current_tab = "scrape"

    @override
    def compose(self) -> ComposeResult:
        """Compose config widget layout."""
        yield Container(
            Label("🔥 Firecrawl Configuration", classes="config-header"),
            Horizontal(
                Button("🔧 Scraping", id="tab_scrape", variant="primary", classes="tab-button"),
                Button("🗺️ Mapping", id="tab_map", variant="default", classes="tab-button"),
                Button("🎯 Extraction", id="tab_extract", variant="default", classes="tab-button"),
                classes="tab-buttons",
            ),
            # All forms live here permanently; show_tab() hides the inactive ones.
            Container(
                ScrapeOptionsForm(id="scrape_form"),
                MapOptionsForm(id="map_form"),
                ExtractOptionsForm(id="extract_form"),
                classes="tab-content",
            ),
            Horizontal(
                Button("📋 Load Preset", id="load_preset", variant="default"),
                Button("💾 Save Preset", id="save_preset", variant="default"),
                Button("🔄 Reset", id="reset_config", variant="default"),
                classes="actions",
            ),
        )

    def on_mount(self) -> None:
        """Initialize widget by showing the scrape tab and hiding the others."""
        self.show_tab("scrape")

    def show_tab(self, tab_name: str) -> None:
        """Show specific configuration tab.

        Args:
            tab_name: One of ``"scrape"``, ``"map"`` or ``"extract"``. Any
                other value is ignored and the current tab stays selected.
        """
        if tab_name not in self._TABS:
            return
        self.current_tab = tab_name
        for tab in self._TABS:
            is_active = tab == tab_name
            self.query_one(f"#tab_{tab}", Button).variant = "primary" if is_active else "default"
            # Toggle visibility instead of removing and re-mounting: the form
            # keeps its state, and no widget id is ever mounted twice while a
            # removal is still pending.
            self.query_one(f"#{tab}_form").display = is_active

    def on_button_pressed(self, event: Button.Pressed) -> None:
        """Handle button presses.

        Only the three tab buttons are handled here; presses from any other
        button are left to bubble to ancestors.
        """
        button_id = event.button.id or ""
        if not button_id.startswith("tab_"):
            return
        tab_name = button_id[len("tab_"):]
        if tab_name in self._TABS:
            event.stop()
            self.show_tab(tab_name)

    def get_all_options(self) -> FirecrawlOptions:
        """Get all configuration options.

        Returns:
            The merged options of the scrape, map and extract forms. A form
            that is not mounted or not composed yet (the lookup raises
            ``NoMatches``) contributes nothing. Any other error propagates
            instead of being swallowed.
        """
        # Local import keeps this block self-contained; the file's import
        # section does not bring NoMatches into scope.
        from textual.css.query import NoMatches

        options: FirecrawlOptions = {}
        readers = (
            lambda: self.query_one("#scrape_form", ScrapeOptionsForm).get_scrape_options(),
            lambda: self.query_one("#map_form", MapOptionsForm).get_map_options(),
            lambda: self.query_one("#extract_form", ExtractOptionsForm).get_extract_options(),
        )
        for read_form in readers:
            try:
                form_options = read_form()
            except NoMatches:
                # Form (or one of its inputs) does not exist yet; skip it.
                continue
            options.update(cast(FirecrawlOptions, form_options))
        return options