import asyncio
import os
import random
from urllib.parse import quote, urlparse

# Note: this version drives stock Playwright Firefox via crawlee's PlaywrightCrawler.
# Camoufox is an external package and is not bundled with crawlee; using it requires
# a custom browser plugin (see the commented sketch in main()).
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

# ScraperAPI key: read it from the environment rather than committing a secret.
SCRAPERAPI_KEY = os.environ.get("SCRAPERAPI_KEY", "<your-scraperapi-key>")

def is_social_media_url(url: str) -> bool:
    """Return True if the URL's host is a known social domain (or a subdomain of one)."""
    domain = urlparse(url).netloc.lower()
    return any(
        domain == social_domain or domain.endswith("." + social_domain)
        for social_domain in ("instagram.com", "linkedin.com", "facebook.com")
    )
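
# Example: is_social_media_url("https://www.linkedin.com/in/jane-doe") -> True
#          is_social_media_url("https://news.ycombinator.com/") -> False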

async def main(start_url: str) -> None:
    is_social = is_social_media_url(start_url)
    # Persist a browser profile for social sites so manual logins survive restarts.
    user_data_dir = "./user_data" if is_social else None

    # Configure PlaywrightCrawler directly, without a BrowserPool or custom plugin.
    crawler = PlaywrightCrawler(
        browser_type='firefox',  # Stock Playwright Firefox, not Camoufox (see below).
        headless=not is_social,  # Run headful on social sites so a human can log in.
        # In recent crawlee versions, passing `user_data_dir` launches a persistent
        # browser context; there is no separate `persistent_context` flag.
        user_data_dir=user_data_dir,
        max_requests_per_crawl=1,
    )
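
    # To actually use Camoufox, crawlee needs a custom browser plugin. A hedged
    # sketch, loosely following crawlee's Camoufox example (class and method names
    # may differ between versions, so verify against your installed release):
    #
    #     from camoufox import AsyncNewBrowser
    #     from crawlee.browsers import (
    #         BrowserPool, PlaywrightBrowserController, PlaywrightBrowserPlugin,
    #     )
    #
    #     class CamoufoxPlugin(PlaywrightBrowserPlugin):
    #         async def new_browser(self) -> PlaywrightBrowserController:
    #             return PlaywrightBrowserController(
    #                 browser=await AsyncNewBrowser(self._playwright, headless=True),
    #             )
    #
    #     crawler = PlaywrightCrawler(browser_pool=BrowserPool(plugins=[CamoufoxPlugin()]))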

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        original_target_url = context.request.url

        # ScraperAPI's rendering endpoint fetches the target server-side. The target
        # URL must be percent-encoded so its scheme and query string survive as a
        # single `url` parameter.
        scraperapi_request_url = (
            f"http://api.scraperapi.com/?api_key={SCRAPERAPI_KEY}"
            f"&url={quote(original_target_url, safe='')}"
            "&render=true&country_code=us&ultra_stealth=true"
        )
        # Log only the target; logging the full request URL would leak the API key.
        context.log.info(f'Processing {original_target_url} via ScraperAPI')

        await context.page.goto(scraperapi_request_url)
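
        # Alternative (hedged): ScraperAPI also offers a proxy mode, which keeps
        # context.page.url pointing at the real target instead of api.scraperapi.com,
        # making the redirect check below meaningful. A sketch, assuming crawlee
        # forwards `browser_launch_options` to Playwright's launch():
        #
        #     crawler = PlaywrightCrawler(
        #         browser_launch_options={
        #             'proxy': {
        #                 'server': 'http://proxy-server.scraperapi.com:8001',
        #                 'username': 'scraperapi',
        #                 'password': SCRAPERAPI_KEY,
        #             },
        #         },
        #     )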

        # Rate Limiting for social media
        if is_social_media_url(original_target_url):
            context.log.info("Applying social media rate limit...")
            await asyncio.sleep(random.randint(5, 10))

        # Login/CAPTCHA detection. Caveat: after navigating to the api.scraperapi.com
        # endpoint, context.page.url stays on that host, so the redirect check below
        # is only meaningful in proxy mode (see the sketch above).
        page_content = await context.page.content()
        current_page_url = context.page.url
        login_keywords = ["Sign in", "Log in", "Verify you are human", "Robot Check", "CAPTCHA"]
        is_login_wall = any(keyword.lower() in page_content.lower() for keyword in login_keywords)
        is_login_redirect = "login" in current_page_url.lower()

        if is_login_wall or is_login_redirect:
            context.log.warning(f"Login wall or CAPTCHA detected for {original_target_url}.")
            if is_social:  # Headful mode, so a human can complete the login.
                context.log.warning("Please log in manually in the browser window. Waiting for 5 minutes...")
                await asyncio.sleep(300)
                context.log.info("Resuming scraping.")
        
        # Data Extraction
        extracted_data = {}
        if "linkedin.com" in original_target_url:
            context.log.info("Extracting LinkedIn data...")
            try:
                await context.page.wait_for_selector('.scaffold-layout__main', timeout=10000)
                extracted_data['title'] = await context.page.title()
                # More specific selectors for LinkedIn profile data would go here
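                # Hypothetical example; LinkedIn's class names churn frequently, so
                # treat any selector here as a placeholder to verify in the live DOM:
                #
                #     extracted_data['headline'] = await context.page.locator(
                #         '.text-body-medium',
                #     ).first.inner_text()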
            except Exception as e:
                context.log.warning(f"LinkedIn extraction failed: {e}")
                extracted_data['error'] = str(e)
        
        await context.push_data({'url': original_target_url, 'data': extracted_data})

    await crawler.run([start_url])

if __name__ == '__main__':
    # LinkedIn's home page will almost always present a login wall to a fresh profile.
    linkedin_url = "https://www.linkedin.com"
    asyncio.run(main(linkedin_url))
