feat(docs): add scaffolding for iceberg docs (#36888)
* feat(docs): add scaffolding for iceberg docs

* docs(storage/iceberg): Analytics Buckets docs

* Apply suggestions from code review

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Charis <26616127+charislam@users.noreply.github.com>

* fix(docs): update Analytics Buckets docs

* nav reorg

* docs: Analytics Buckets limits section

* fix CI errors

---------

Co-authored-by: fenos <fabri.feno@gmail.com>
Co-authored-by: Inian <inian1234@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
@@ -1717,6 +1717,25 @@ export const storage: NavMenuConstant = {
       { name: 'API Compatibility', url: '/guides/storage/s3/compatibility' },
     ],
   },
+  {
+    name: 'Analytics Buckets',
+    url: undefined,
+    items: [
+      { name: 'Introduction', url: '/guides/storage/analytics/introduction' },
+      {
+        name: 'Creating Analytics Buckets',
+        url: '/guides/storage/analytics/creating-analytics-buckets',
+      },
+      {
+        name: 'Connecting to Analytics Buckets',
+        url: '/guides/storage/analytics/connecting-to-analytics-bucket',
+      },
+      {
+        name: 'Limits',
+        url: '/guides/storage/analytics/limits',
+      },
+    ],
+  },
   {
     name: 'CDN',
     url: undefined,
187
apps/docs/content/guides/storage/analytics/connecting-to-analytics-bucket.mdx
Normal file
@@ -0,0 +1,187 @@
---
title: 'Connecting to Analytics Buckets'
---

<Admonition type="caution">

This feature is in **Private Alpha**. API stability and backward compatibility are not guaranteed at this stage. Request access using this [form](https://forms.supabase.com/analytics-buckets).

</Admonition>

When interacting with Analytics Buckets, you authenticate against two services: the Iceberg REST Catalog and the S3-compatible storage endpoint.

The **Iceberg REST Catalog** acts as the central management system for Iceberg tables. It allows Iceberg clients, such as PyIceberg and Apache Spark, to perform metadata operations, including:

- Creating and managing tables and namespaces
- Tracking schemas and handling schema evolution
- Managing partitions and snapshots
- Ensuring transactional consistency and isolation

The REST Catalog itself does not store the actual data. Instead, it stores metadata describing the structure, schema, and partitioning strategy of Iceberg tables.

Actual data storage and retrieval happen through the separate S3-compatible endpoint, which is optimized for reading and writing large analytical datasets stored in Parquet files.
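
You can observe this split yourself from PyIceberg once you are connected. A small sketch (the `catalog` object here is assumed to be loaded exactly as in the PyIceberg example later on this page, with the `default.events` table already created):

```python
# Sketch: assumes `catalog` was loaded as in the PyIceberg example below,
# and that the ("default", "events") table already exists.
table = catalog.load_table(("default", "events"))

# The REST Catalog hands out a pointer to the table's metadata JSON...
print(table.metadata_location)

# ...while the rows themselves live in Parquet data files that are read
# and written through the S3-compatible endpoint.
print([task.file.file_path for task in table.scan().plan_files()])
```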

## Authentication

To connect to an Analytics Bucket, you will need:

- An Iceberg client (Spark, PyIceberg, etc.) that supports the REST Catalog interface.
- S3 credentials to authenticate your Iceberg client with the underlying S3 bucket.
  To create S3 credentials, go to [**Project Settings > Storage**](https://supabase.com/dashboard/project/_/settings/storage). For more information, see the [S3 Authentication Guide](https://supabase.com/docs/guides/storage/s3/authentication). We will support other authentication methods in the future.
- The project reference and Service key for your Supabase project.
  You can find your Service key in the Supabase Dashboard under [**Project Settings > API**](https://supabase.com/dashboard/project/_/settings/api-keys).

You will now have an **Access Key** and a **Secret Key** that you can use to authenticate your Iceberg client.
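
To avoid hard-coding these secrets in your scripts, one option is to read them from environment variables. A minimal sketch (the variable names here are illustrative, not a Supabase convention):

```python
import os

# Illustrative variable names; use whatever fits your deployment.
PROJECT_REF = os.environ["SUPABASE_PROJECT_REF"]
TOKEN = os.environ["SUPABASE_SERVICE_KEY"]  # Service key for the REST Catalog
S3_ACCESS_KEY = os.environ["SUPABASE_S3_ACCESS_KEY"]
S3_SECRET_KEY = os.environ["SUPABASE_S3_SECRET_KEY"]
S3_REGION = os.environ["SUPABASE_S3_REGION"]
```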

## Connecting via PyIceberg

PyIceberg is a Python client for Apache Iceberg that makes it easy to interact with Analytics Buckets.

**Installation**

```bash
pip install pyiceberg pyarrow pandas
```

Here's a complete example using PyIceberg, with the configuration clearly separated:

```python
from pyiceberg.catalog import load_catalog
import pyarrow as pa
import datetime

# Supabase project ref
PROJECT_REF = "<your-supabase-project-ref>"

# Configuration for Iceberg REST Catalog
WAREHOUSE = "your-analytics-bucket-name"
TOKEN = "SERVICE_KEY"

# Configuration for S3-Compatible Storage
S3_ACCESS_KEY = "KEY"
S3_SECRET_KEY = "SECRET"
S3_REGION = "PROJECT_REGION"

S3_ENDPOINT = f"https://{PROJECT_REF}.supabase.co/storage/v1/s3"
CATALOG_URI = f"https://{PROJECT_REF}.supabase.co/storage/v1/iceberg"

# Load the Iceberg catalog
catalog = load_catalog(
    "analytics-bucket",
    type="rest",
    warehouse=WAREHOUSE,
    uri=CATALOG_URI,
    token=TOKEN,
    **{
        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
        "s3.endpoint": S3_ENDPOINT,
        "s3.access-key-id": S3_ACCESS_KEY,
        "s3.secret-access-key": S3_SECRET_KEY,
        "s3.region": S3_REGION,
        "s3.force-virtual-addressing": False,
    },
)

# Create namespace if it doesn't exist
catalog.create_namespace_if_not_exists("default")

# Define schema for your Iceberg table
schema = pa.schema([
    pa.field("event_id", pa.int64()),
    pa.field("event_name", pa.string()),
    pa.field("event_timestamp", pa.timestamp("ms")),
])

# Create table (if it doesn't exist already)
table = catalog.create_table_if_not_exists(("default", "events"), schema=schema)

# Generate and insert sample data
current_time = datetime.datetime.now()
data = pa.table({
    "event_id": [1, 2, 3],
    "event_name": ["login", "logout", "purchase"],
    "event_timestamp": [current_time, current_time, current_time],
})

# Append data to the Iceberg table
table.append(data)

# Scan table and print data as pandas DataFrame
df = table.scan().to_pandas()
print(df)
```
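
Once the table exists, you don't have to pull everything back; PyIceberg can push a filter down into the scan so only matching data files are read. A short sketch, reusing the `events` table from above:

```python
# Read back only a subset of rows and columns; the filter is pushed down
# so non-matching data files are skipped entirely.
filtered = table.scan(
    row_filter="event_id >= 2",
    selected_fields=("event_id", "event_name"),
).to_pandas()
print(filtered)
```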

## Connecting via Apache Spark

Apache Spark lets you run distributed analytical queries against Analytics Buckets.

```python
from pyspark.sql import SparkSession

# Supabase project ref
PROJECT_REF = "<your-supabase-ref>"

# Configuration for Iceberg REST Catalog
WAREHOUSE = "your-analytics-bucket-name"
TOKEN = "SERVICE_KEY"

# Configuration for S3-Compatible Storage
S3_ACCESS_KEY = "KEY"
S3_SECRET_KEY = "SECRET"
S3_REGION = "PROJECT_REGION"

S3_ENDPOINT = f"https://{PROJECT_REF}.supabase.co/storage/v1/s3"
CATALOG_URI = f"https://{PROJECT_REF}.supabase.co/storage/v1/iceberg"

# Initialize Spark session with Iceberg configuration
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("SupabaseIceberg") \
    .config("spark.driver.host", "127.0.0.1") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,org.apache.iceberg:iceberg-aws-bundle:1.6.1") \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.my_catalog", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.my_catalog.type", "rest") \
    .config("spark.sql.catalog.my_catalog.uri", CATALOG_URI) \
    .config("spark.sql.catalog.my_catalog.warehouse", WAREHOUSE) \
    .config("spark.sql.catalog.my_catalog.token", TOKEN) \
    .config("spark.sql.catalog.my_catalog.s3.endpoint", S3_ENDPOINT) \
    .config("spark.sql.catalog.my_catalog.s3.path-style-access", "true") \
    .config("spark.sql.catalog.my_catalog.s3.access-key-id", S3_ACCESS_KEY) \
    .config("spark.sql.catalog.my_catalog.s3.secret-access-key", S3_SECRET_KEY) \
    .config("spark.sql.catalog.my_catalog.s3.remote-signing-enabled", "false") \
    .config("spark.sql.defaultCatalog", "my_catalog") \
    .getOrCreate()

# SQL Operations
spark.sql("CREATE NAMESPACE IF NOT EXISTS analytics")

spark.sql("""
    CREATE TABLE IF NOT EXISTS analytics.users (
        user_id BIGINT,
        username STRING
    )
    USING iceberg
""")

spark.sql("""
    INSERT INTO analytics.users (user_id, username)
    VALUES (1, 'Alice'), (2, 'Bob'), (3, 'Charlie')
""")

result_df = spark.sql("SELECT * FROM analytics.users")
result_df.show()
```
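
The catalog is not limited to SQL; Spark's DataFrame API works against the same tables. A brief sketch, reusing the `analytics.users` table created above (the extra row is just an example):

```python
# Read the Iceberg table back as a DataFrame.
users_df = spark.table("analytics.users")
users_df.show()

# Appends also work through the DataFrameWriterV2 API.
new_rows = spark.createDataFrame([(4, "Dana")], ["user_id", "username"])
new_rows.writeTo("analytics.users").append()
```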

## Connecting to the Iceberg REST Catalog directly

To authenticate with the Iceberg REST Catalog directly, provide a valid Supabase **Service key** as a Bearer token.

```bash
curl \
  --request GET -sL \
  --url 'https://<your-supabase-project>.supabase.co/storage/v1/iceberg/v1/config?warehouse=<bucket-name>' \
  --header 'Authorization: Bearer <your-service-key>'
```
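
The same handshake can be scripted. Below is a sketch using Python's `requests`, assuming the catalog follows the standard Iceberg REST flow, in which `/v1/config` may return a path `prefix` to be inserted into subsequent requests:

```python
import requests

PROJECT_REF = "<your-supabase-project-ref>"
TOKEN = "<your-service-key>"
WAREHOUSE = "your-analytics-bucket-name"

base = f"https://{PROJECT_REF}.supabase.co/storage/v1/iceberg"
headers = {"Authorization": f"Bearer {TOKEN}"}

# Fetch the catalog configuration for this warehouse.
config = requests.get(
    f"{base}/v1/config", params={"warehouse": WAREHOUSE}, headers=headers
).json()
print(config)

# Per the Iceberg REST spec, a "prefix" override (if present) is inserted
# into subsequent catalog paths, e.g. when listing namespaces.
prefix = config.get("overrides", {}).get("prefix")
ns_path = f"{base}/v1/{prefix}/namespaces" if prefix else f"{base}/v1/namespaces"
print(requests.get(ns_path, headers=headers).json())
```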
38
apps/docs/content/guides/storage/analytics/creating-analytics-buckets.mdx
Normal file
@@ -0,0 +1,38 @@
---
title: 'Creating Analytics Buckets'
subtitle: ''
---

<Admonition type="caution">

This feature is in **Private Alpha**. API stability and backward compatibility are not guaranteed at this stage. Request access using this [form](https://forms.supabase.com/analytics-buckets).

</Admonition>

Analytics Buckets use [Apache Iceberg](https://iceberg.apache.org/), an open table format for managing large analytical datasets.
You can interact with them using tools such as [PyIceberg](https://py.iceberg.apache.org/), [Apache Spark](https://spark.apache.org/), or any client that supports the [standard Iceberg REST Catalog API](https://editor-next.swagger.io/?url=https://raw.githubusercontent.com/apache/iceberg/main/open-api/rest-catalog-open-api.yaml).

You can create an Analytics Bucket using either the Supabase SDK or the Supabase Dashboard.

### Using the Supabase SDK

```ts
import { createClient } from '@supabase/supabase-js'

const supabase = createClient('https://your-project.supabase.co', 'your-service-key')

const { data, error } = await supabase.storage.createBucket('my-analytics-bucket', {
  type: 'ANALYTICS',
})
```

### Using the Supabase Dashboard

1. Navigate to the Storage section in the Supabase Dashboard.
2. Click "Create Bucket".
3. Enter a name for your bucket (e.g., `my-analytics-bucket`).
4. Select "Analytics Bucket" as the bucket type.

<img alt="Creating an Analytics Bucket in the Supabase Dashboard" src="/docs/img/storage/iceberg-bucket.png" />

Now that you have created your Analytics Bucket, you can start [connecting to it](/docs/guides/storage/analytics/connecting-to-analytics-bucket) with Iceberg clients like PyIceberg or Apache Spark.
24
apps/docs/content/guides/storage/analytics/introduction.mdx
Normal file
@@ -0,0 +1,24 @@
---
title: 'Analytics Buckets'
subtitle: ''
---

<Admonition type="caution">

This feature is in **Private Alpha**. API stability and backward compatibility are not guaranteed at this stage. Request access using this [form](https://forms.supabase.com/analytics-buckets).

</Admonition>

**Analytics Buckets** are designed for analytical workflows on large datasets without impacting your main database.

Postgres tables are optimized for handling real-time, transactional workloads with frequent inserts, updates, and deletes, and low-latency queries. **Analytical workloads** have very different requirements: processing large volumes of historical data, running complex queries and aggregations, minimizing storage costs, and ensuring these analytical queries do not interfere with production traffic.

**Analytics Buckets** address these requirements using [Apache Iceberg](https://iceberg.apache.org/), an open table format for managing large analytical datasets efficiently.

Analytics Buckets are ideal for:

- Data warehousing and business intelligence
- Historical data archiving
- Periodically refreshed real-time analytics
- Complex analytical queries over large datasets

By separating transactional and analytical workloads, Supabase makes it easy to build scalable analytics pipelines without impacting your primary Postgres performance.
23
apps/docs/content/guides/storage/analytics/limits.mdx
Normal file
@@ -0,0 +1,23 @@
---
title: 'Analytics Buckets Limits'
subtitle: ''
---

<Admonition type="caution">

This feature is in **Private Alpha**. API stability and backward compatibility are not guaranteed at this stage. Request access using this [form](https://forms.supabase.com/analytics-buckets).

</Admonition>

The following default limits apply while this feature is in Private Alpha; they can be adjusted on a case-by-case basis:

| **Category**                             | **Limit** |
| ---------------------------------------- | --------- |
| Number of Analytics Buckets per project  | 2         |
| Number of namespaces per bucket          | 10        |
| Number of tables per namespace           | 10        |

## Pricing

Analytics Buckets are free to use during the Private Alpha phase; however, you'll still be charged for the underlying egress.
BIN
apps/docs/public/img/storage/iceberg-bucket.png
Normal file
Binary file not shown.
@@ -10,6 +10,7 @@ may_uppercase = [
   "Analytics",
   "Android",
   "Angular",
+  "Apache Spark",
   "Apple",
   "Assistant",
   "Audit Logs?",
@@ -25,6 +26,7 @@ may_uppercase = [
   "Branching",
   "Broadcast",
   "CAPTCHA",
+  "Catalog",
   "Channel",
   "ChatGPT",
   "Chrome",
@@ -95,6 +97,7 @@ may_uppercase = [
   "IPv4",
   "IPv6",
   "IVFFlat",
+  "Iceberg",
   "IdP",
   "Inbucket",
   "Index Advisor",
@@ -150,6 +153,7 @@ may_uppercase = [
   "Prisma",
   "PrivateLink",
   "Prometheus",
+  "PyIceberg",
   "Python",
   "Qodo Gen",
   "Queues?",
@@ -246,6 +246,7 @@ allow_list = [
   "PubSub",
   "Prisma",
   "PrivateLink",
+  "PyIceberg",
   "Qodo",
   "README",
   "Redis",