// src/lib/crawlJob.ts
// Algo Feed: uses YouTube Data API v3 (videos.list?chart=mostPopular)
// across 16 category filters × 4 regions = 64 requests of up to 50 videos each.
// Only videos of 60 seconds or less are kept, de-duplicated by video ID.
// Results are TF-IDF clustered, then named by 1 Gemini call.
// Falls back to timestamp-based names if Gemini fails.

import pool from './db';
import { clusterTitles } from './cluster';
import { geminiPrompt } from './gemini';
import { ytGet } from './youtubeApi';
import './discoverDb';

/** Region codes crawled on every run; each one is sent to the API as `regionCode`. */
const REGIONS: string[] = ['US', 'GB', 'CA', 'AU'];

/**
 * YouTube video category IDs requested for every region.
 * The special entry '0' means "send no category filter".
 */
const CATEGORY_IDS: string[] = [
    '0',              // no category filter (overall chart)
    '1', '2',         // Film & Animation · Autos & Vehicles
    '10',             // Music
    '15', '17', '19', // Pets & Animals · Sports · Travel & Events
    '20',             // Gaming
    '22', '23', '24', // People & Blogs · Comedy · Entertainment
    '25',             // News & Politics
    '26', '27',       // Howto & Style · Education
    '28', '29',       // Science & Technology · Nonprofits & Activism
];

/**
 * Parse an ISO 8601 duration string into total seconds (e.g. PT1M30S → 90).
 *
 * Week and day designators are honoured in addition to hours, minutes and
 * seconds, so a duration of a day or more (e.g. P1DT2H) is not reported as 0.
 * Year and month designators are not supported because their length in
 * seconds is ambiguous; such inputs yield 0.
 *
 * @param iso Duration such as "PT45S", "PT1H2M3S" or "P1DT30M".
 * @returns Whole seconds, or 0 when the input is empty or not a recognised duration.
 */
function parseDurationSecs(iso: string): number {
    if (!iso) return 0;

    // Anchored at both ends so text surrounding a duration is rejected instead
    // of being partially parsed.
    const m = /^P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?$/.exec(iso);
    if (!m) return 0;

    const [, weeks, days, hours, minutes, seconds] = m;
    // Absent components come back as undefined and count as zero.
    const toInt = (text: string | undefined): number => (text ? Number.parseInt(text, 10) : 0);

    return (
        toInt(weeks) * 604800 +
        toInt(days) * 86400 +
        toInt(hours) * 3600 +
        toInt(minutes) * 60 +
        toInt(seconds)
    );
}


/** Flattened view of one trending video, as produced by fetchTrendingBatch. */
interface VideoCard {
    // --- identity ---
    /** Video ID taken from the API item's `id`. */
    videoId: string;
    /** Video title from the snippet ('' when absent). */
    title: string;

    // --- channel ---
    /** Channel display name from the snippet ('' when absent). */
    channel: string;
    /** Channel ID from the snippet ('' when absent). */
    channelId: string;

    // --- metrics / media ---
    /** View count as reported by the API, kept as a string ('0' when absent). */
    views: string;
    /** Length of the video in seconds, parsed from contentDetails.duration. */
    durationSecs: number;
    /** Medium thumbnail URL, falling back to the default thumbnail, else ''. */
    thumbnail: string;

    // --- provenance ---
    /** Region code the video was fetched for. */
    region: string;
    /** Category ID the video was fetched for ('0' = unfiltered). */
    categoryId: string;
    /** `${region}_${categoryId}` — identifies the request that produced the card. */
    sessionId: string;
    /** `api_${region}_cat${categoryId}` — origin tag stored with the snapshot. */
    source: string;
}

/**
 * Request the "most popular" chart for one region/category pair and keep only
 * the entries that look like Shorts.
 *
 * An entry is kept when its title is longer than 5 characters and its duration
 * is between 1 and 60 seconds inclusive. A categoryId of '0' sends no category
 * filter. If the request or the mapping fails, a warning is logged and an
 * empty array is returned.
 */
async function fetchTrendingBatch(region: string, categoryId: string): Promise<VideoCard[]> {
    const sessionId = `${region}_${categoryId}`;
    try {
        const query: Record<string, string | number> = {
            part: 'snippet,statistics,contentDetails',
            chart: 'mostPopular',
            regionCode: region,
            maxResults: 50,
            hl: 'en',
        };
        if (categoryId !== '0') {
            query.videoCategoryId = categoryId;
        }

        const response = await ytGet('/videos', query);
        const rawItems: any[] = response.items || [];

        const shorts: VideoCard[] = [];
        for (const raw of rawItems) {
            const thumbs = raw.snippet?.thumbnails;
            const card: VideoCard = {
                title: raw.snippet?.title || '',
                channel: raw.snippet?.channelTitle || '',
                channelId: raw.snippet?.channelId || '',
                views: raw.statistics?.viewCount || '0',
                videoId: raw.id,
                thumbnail: thumbs?.medium?.url || thumbs?.default?.url || '',
                durationSecs: parseDurationSecs(raw.contentDetails?.duration || ''),
                sessionId,
                source: `api_${region}_cat${categoryId}`,
                categoryId,
                region,
            };

            // Skip near-empty titles and anything that is not 1–60 seconds long.
            const hasUsableTitle = card.title.length > 5;
            const isShort = card.durationSecs > 0 && card.durationSecs <= 60;
            if (hasUsableTitle && isShort) {
                shorts.push(card);
            }
        }
        return shorts;
    } catch (err: any) {
        console.warn(`[CrawlJob] API fetch failed for ${region}/${categoryId}: ${err.message}`);
        return [];
    }
}

/**
 * Decide whether the stored feed is stale enough to crawl again.
 *
 * Resolves to true when no snapshot exists, when the newest snapshot is at
 * least six hours old, or when the lookup itself throws.
 */
export async function shouldCrawl(): Promise<boolean> {
    // Six hours, expressed in milliseconds. Hardcoded since cron is removed.
    const maxAgeMs = 6 * 60 * 60 * 1000;

    try {
        const [rows] = await pool.execute<any[]>(
            `SELECT MAX(crawled_at) AS last FROM algo_feed_snapshots`
        );
        const newest = rows[0]?.last as Date | null;
        if (!newest) {
            return true;
        }

        const elapsedMs = Date.now() - new Date(newest).getTime();
        return elapsedMs >= maxAgeMs;
    } catch {
        return true;
    }
}

/**
 * Run the full crawl pipeline and return a summary.
 *
 * Steps:
 *  1. Delete snapshots older than 12 hours.
 *  2. Fetch every region × category combination in parallel batches, keeping
 *     only the first occurrence of each video ID.
 *  3. Insert one snapshot row per video (failed inserts are counted, not fatal).
 *  4. Cluster the titles and ask Gemini once for cluster names; any cluster
 *     left without a usable name gets "<first keyword> · <crawl time>".
 *  5. Replace the 'algo_feed' rows in discovered_niches, append one row per
 *     cluster to niche_score_history, and prune history older than 30 days.
 *
 * Database and Gemini failures are caught and logged rather than thrown.
 *
 * @returns `crawled` — number of unique videos fetched; `clusters` — number of
 *          clusters produced. Both are 0 when nothing could be fetched.
 */
export async function runCrawlJob(): Promise<{ crawled: number; clusters: number; error?: string }> {
    // Number of region/category requests issued concurrently.
    const BATCH_SIZE = 12;
    // Pause between batches (ms) to avoid quota issues.
    const BATCH_PAUSE_MS = 500;
    // Score bonus added per distinct session a cluster appears in.
    const SESSION_BONUS = 15;

    console.log(`[CrawlJob] Starting Shorts-only trending crawl via YouTube API…`);

    // Clean up old snapshots
    try {
        await pool.execute(`DELETE FROM algo_feed_snapshots WHERE crawled_at < NOW() - INTERVAL 12 HOUR`);
    } catch {
        console.warn('[CrawlJob] Cleanup skipped (table may not exist yet)');
    }

    // Build every Region × Category combination
    const combos: [string, string][] = [];
    for (const region of REGIONS) {
        for (const catId of CATEGORY_IDS) {
            combos.push([region, catId]);
        }
    }

    const allCards: VideoCard[] = [];
    const seenVideoIds = new Set<string>();

    // Fetch in parallel batches of BATCH_SIZE; the first session to return a
    // given video ID keeps it.
    for (let i = 0; i < combos.length; i += BATCH_SIZE) {
        const batch = combos.slice(i, i + BATCH_SIZE);
        const results = await Promise.all(batch.map(([r, c]) => fetchTrendingBatch(r, c)));
        for (const cards of results) {
            for (const card of cards) {
                if (!seenVideoIds.has(card.videoId)) {
                    seenVideoIds.add(card.videoId);
                    allCards.push(card);
                }
            }
        }
        if (i + BATCH_SIZE < combos.length) {
            await new Promise(r => setTimeout(r, BATCH_PAUSE_MS));
        }
    }

    console.log(`[CrawlJob] Fetched ${allCards.length} unique Shorts (≤60s) across ${combos.length} region/category combos`);

    if (allCards.length === 0) {
        console.error('[CrawlJob] No videos fetched — check YOUTUBE_API_KEY and quota.');
        return { crawled: 0, clusters: 0 };
    }

    // Store snapshots in DB (include channelId in source field for later linking).
    // Failed inserts (e.g. duplicates) are skipped, but counted so that a
    // systematic failure does not go unnoticed.
    let skippedSnapshots = 0;
    for (const c of allCards) {
        try {
            await pool.execute(
                `INSERT INTO algo_feed_snapshots (session_id, title, channel, views, video_id, thumbnail, source)
                 VALUES (?, ?, ?, ?, ?, ?, ?)`,
                [c.sessionId, c.title, c.channel, c.views, c.videoId, c.thumbnail, `${c.source}|ch:${c.channelId}`]
            );
        } catch {
            skippedSnapshots++;
        }
    }
    if (skippedSnapshots > 0) {
        console.log(`[CrawlJob] ${skippedSnapshots} snapshot insert(s) skipped (duplicate rows or DB errors)`);
    }

    // TF-IDF cluster — sessionId is passed so the clusterer can see which
    // region/category session each title came from.
    // NOTE(review): the second argument (2) is presumably a minimum cluster size — confirm in ./cluster.
    const clusters = clusterTitles(
        allCards.map(c => ({ title: c.title, channel: c.channel, sessionId: c.sessionId })),
        2
    );

    console.log(`[CrawlJob] Found ${clusters.length} raw clusters`);

    // Timestamp for fallback names
    const crawlTime = new Date().toLocaleString('en-US', {
        month: 'short', day: 'numeric', hour: '2-digit', minute: '2-digit', hour12: true,
    });
    const fallbackName = (keywords: readonly string[]): string => `${keywords[0] || 'Trend'} · ${crawlTime}`;

    // Batch name clusters with 1 Gemini call
    let namedClusters = clusters;
    if (clusters.length > 0) {
        let names: unknown[] = [];
        try {
            const prompt = `You are a YouTube niche analyst. For each topic cluster below, generate a short punchy niche name (3-5 words max). Respond ONLY with a JSON array of strings in the SAME ORDER.

Clusters:
${clusters.map((c, i) => `${i + 1}. Keywords: [${c.keywords.slice(0, 5).join(', ')}] — Samples: "${c.titles.slice(0, 2).join('" / "')}"`).join('\n')}

Respond with ONLY: ["Name 1", "Name 2", ...]`;

            const raw = await geminiPrompt(prompt);
            // Pull the first '[' … last ']' span out of the reply and parse it as JSON.
            const match = raw.match(/\[[\s\S]*\]/);
            const parsed: unknown = match ? JSON.parse(match[0]) : null;
            if (Array.isArray(parsed)) {
                names = parsed;
            }
        } catch (e) {
            console.warn('[CrawlJob] Gemini naming failed — using timestamp fallback:', e);
        }

        // Validate each entry on its own so one bad value does not discard the
        // usable names; anything missing, blank or non-string gets the fallback.
        namedClusters = clusters.map((c, i) => {
            const candidate = names[i];
            const cleaned = typeof candidate === 'string' ? candidate.trim() : '';
            return { ...c, name: cleaned || fallbackName(c.keywords) };
        });
    }

    // Index title → sessions once, so counting sessions per cluster does not
    // rescan every card for every cluster.
    const sessionsByTitle = new Map<string, Set<string>>();
    for (const card of allCards) {
        let sessions = sessionsByTitle.get(card.title);
        if (!sessions) {
            sessions = new Set<string>();
            sessionsByTitle.set(card.title, sessions);
        }
        sessions.add(card.sessionId);
    }

    // Count unique sessions (region/category combos) each cluster appears in
    const sessionCountPerCluster = namedClusters.map(cluster => {
        const sessions = new Set<string>();
        for (const title of cluster.titles) {
            for (const sessionId of sessionsByTitle.get(title) ?? []) {
                sessions.add(sessionId);
            }
        }
        return sessions.size;
    });

    // Replace the previous algo_feed clusters in discovered_niches
    try {
        await pool.execute(`DELETE FROM discovered_niches WHERE source = 'algo_feed'`);
    } catch (e) {
        console.warn('[CrawlJob] Could not clear previous algo_feed niches:', e);
    }

    for (let i = 0; i < namedClusters.length; i++) {
        const cluster = namedClusters[i];
        const sessionCount = sessionCountPerCluster[i];
        const finalScore = cluster.score + (sessionCount * SESSION_BONUS);
        try {
            await pool.execute(
                `INSERT INTO discovered_niches (name, keywords, source, session_count, item_count, score, sample_titles, sample_channels)
                 VALUES (?, ?, 'algo_feed', ?, ?, ?, ?, ?)`,
                [
                    cluster.name,
                    cluster.keywords.join(', '),
                    sessionCount,
                    cluster.titles.length,
                    finalScore,
                    cluster.titles.slice(0, 4).join(' | '),
                    cluster.channels.slice(0, 4).join(', '),
                ]
            );
        } catch (e) {
            console.warn('[CrawlJob] Insert cluster failed:', e);
        }

        // Feature 1: Record score history for sparklines
        try {
            await pool.execute(
                `INSERT INTO niche_score_history (niche_name, score, session_count, item_count)
                 VALUES (?, ?, ?, ?)`,
                [cluster.name, finalScore, sessionCount, cluster.titles.length]
            );
        } catch (e) {
            console.warn('[CrawlJob] Insert history failed:', e);
        }
    }

    // Prune history older than 30 days
    try {
        await pool.execute(`DELETE FROM niche_score_history WHERE crawled_at < NOW() - INTERVAL 30 DAY`);
    } catch (e) {
        console.warn('[CrawlJob] History prune failed:', e);
    }

    console.log(`[CrawlJob] Done. ${allCards.length} Shorts → ${namedClusters.length} clusters stored.`);
    return { crawled: allCards.length, clusters: namedClusters.length };
}

