import hashlib
import os
import re
import shutil
import sys
from urllib.parse import urlparse

import requests

# --- Configuration ---
# Directory that receives every ingested document. The path is relative, so it
# is resolved against the current working directory of the process.
KNOWLEDGE_BASE_DIR = "knowledge_base"

# Ensure knowledge base directory exists. This runs when the module is loaded;
# exist_ok=True makes it a no-op if the directory is already present.
os.makedirs(KNOWLEDGE_BASE_DIR, exist_ok=True)

# --- Helper Function to fetch URL content (unprefixed log messages) ---
def fetch_url_content(url):
    """Download *url* with ``requests`` and return the response body as text.

    The request is issued directly from this script with a 10 second
    timeout; an agent-side ``web_fetch`` tool cannot be called from here, so
    plain ``requests`` is used instead.

    Returns:
        The decoded response text, or ``None`` if the request raised a
        ``requests`` exception (connection problem, timeout, or a 4xx/5xx
        status). The failure is reported on stderr.

    Note:
        ``fetch_content_from_url`` in this file does the same work and only
        differs in the ``KB_INGEST:`` prefix of its error message.
    """
    try:
        reply = requests.get(url, timeout=10)
        # Turn 4xx/5xx answers into an HTTPError so error pages are not
        # mistaken for real content.
        reply.raise_for_status()
        text = reply.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}", file=sys.stderr)
        text = None
    return text

# --- Helper Function to fetch URL content (KB_INGEST-prefixed log messages) ---
def fetch_content_from_url(url):
    """Fetch *url* over HTTP(S) and return the body as text.

    Uses ``requests.get`` with a 10 second timeout.

    Returns:
        The decoded response text on success. ``None`` when ``requests``
        raises (network error, timeout, or a 4xx/5xx status); in that case a
        ``KB_INGEST:`` error line is written to stderr.
    """
    body = None
    try:
        result = requests.get(url, timeout=10)
        # 4xx/5xx statuses become HTTPError and are handled below.
        result.raise_for_status()
        body = result.text
    except requests.exceptions.RequestException as e:
        print(f"KB_INGEST: Error fetching URL {url}: {e}", file=sys.stderr)
    return body

# --- Helper Functions to save content to file ---
def _filename_for_url(url):
    """Build a filesystem-safe ``.md`` filename for *url*.

    The stem is the URL's host and path with ``/``, ``.`` and ``:`` turned
    into ``_`` and every other character outside ``[a-zA-Z0-9_-]`` removed,
    cut to 100 characters. That mapping is lossy: it ignores the query string
    and ``;params``, drops characters, and truncates. Whenever any of that
    information was lost (or the stem came out empty) an 8-hex-digit SHA-256
    digest of the URL (without its ``#fragment``) is appended so that distinct
    URLs do not silently overwrite each other's files. URLs that sanitise
    losslessly keep exactly the name they had before the digest was introduced.
    """
    parsed = urlparse(url)
    underscored = re.sub(r"[/.:]", "_", parsed.netloc + parsed.path)
    stem = re.sub(r"[^a-zA-Z0-9_-]", "", underscored)

    lossy = (
        bool(parsed.query or parsed.params)  # not part of the stem at all
        or len(stem) != len(underscored)     # characters were stripped
        or len(stem) > 100                   # tail will be truncated
        or not stem                          # nothing usable left
    )

    name = stem[:100]
    if lossy:
        # The fragment never reaches the server, so it must not affect the name.
        without_fragment = url.split("#", 1)[0]
        # backslashreplace keeps hashing total even for undecodable argv bytes.
        digest = hashlib.sha256(
            without_fragment.encode("utf-8", "backslashreplace")
        ).hexdigest()[:8]
        name = f"{name}_{digest}" if name else digest
    return name + ".md"


def save_content_to_file(content, url):
    """Write *content* to a file in ``KNOWLEDGE_BASE_DIR`` named after *url*.

    Args:
        content: Text to store. Falsy values (``None``, ``""``) are rejected.
        url: Source URL; it determines the target filename.

    Returns:
        ``True`` if the file was written, ``False`` if there was nothing to
        save or the write failed. Every outcome is reported on stderr.
    """
    if not content:
        print(f"KB_INGEST: No content to save for URL: {url}", file=sys.stderr)
        return False

    file_path = os.path.join(KNOWLEDGE_BASE_DIR, _filename_for_url(url))

    try:
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
    except OSError as e:
        print(f"KB_INGEST: Error saving content to {file_path}: {e}", file=sys.stderr)
        return False

    print(f"KB_INGEST: Content from {url} successfully saved to {file_path}", file=sys.stderr)
    return True

# --- Main Logic for URL Ingestion ---
def ingest_url(url):
    """Fetch *url* and store its body in the knowledge base.

    Args:
        url: The HTTP(S) address to ingest.

    Returns:
        ``True`` if the content was fetched and written to disk, ``False``
        if the fetch failed, the body was empty, or the write failed.
    """
    print(f"KB_INGEST: Ingesting content from URL: {url}", file=sys.stderr)
    content = fetch_content_from_url(url)
    if content is None:
        # The fetch failed; fetch_content_from_url already logged the reason.
        return False
    # An empty body is deliberately passed on: save_content_to_file rejects it
    # with its own "No content to save" message, so the failure is not silent.
    return save_content_to_file(content, url)

# --- Main entry point for the script ---
def main():
    """Ingest the URL or local file named by the first command-line argument.

    An argument starting with ``http://`` or ``https://`` is handed to
    ``ingest_url``. An existing regular file is copied byte-for-byte into
    ``KNOWLEDGE_BASE_DIR`` under its base name. All diagnostics go to stderr.

    Returns normally on success. Exits with status 1 when the argument is
    missing, is neither a URL nor an existing regular file, or when the
    ingestion itself fails, so callers can detect failure from the exit code.
    """
    if len(sys.argv) < 2:
        print("Usage: python kb_ingest.py <URL|FILE_PATH>", file=sys.stderr)
        sys.exit(1)

    input_arg = sys.argv[1]

    if input_arg.startswith(("http://", "https://")):
        if not ingest_url(input_arg):
            sys.exit(1)
        return

    # isfile (not exists) so that directories are rejected as invalid input
    # instead of failing later inside the copy.
    if not os.path.isfile(input_arg):
        print(f"KB_INGEST: Invalid input. Must be a URL or a local file path: {input_arg}", file=sys.stderr)
        sys.exit(1)

    print(f"KB_INGEST: Ingesting local file: {input_arg}", file=sys.stderr)
    destination_path = os.path.join(KNOWLEDGE_BASE_DIR, os.path.basename(input_arg))
    try:
        # Simple copy for now, more complex handling (OCR, text extraction) later.
        # copyfile refuses to copy a file onto itself (SameFileError, an
        # OSError); opening the destination for writing first would truncate
        # a file that already lives in the knowledge base.
        shutil.copyfile(input_arg, destination_path)
    except OSError as e:
        print(f"KB_INGEST: Error processing local file {input_arg}: {e}", file=sys.stderr)
        sys.exit(1)

    print(f"KB_INGEST: File {input_arg} successfully copied to {destination_path}", file=sys.stderr)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
