<?php
// Prevent script timeout for long crawling sessions
set_time_limit(0);

// Ensure we see progress in the browser as it happens: flush after every
// output call and drop EVERY active output buffer. PHP can have several
// nested buffers (php.ini output_buffering, an output handler, a prepended
// ob_start()), and removing only the top one leaves output stuck in the rest.
ob_implicit_flush(true);
while (ob_get_level() > 0) {
    // A buffer may refuse to be removed (ob_end_clean() returns false);
    // stop instead of spinning forever on it.
    if (!ob_end_clean()) {
        break;
    }
}

// Crawler configuration. These are read by crawl() through `global`.
$downloadDir = 'files';          // where downloaded media files are stored
$logDir = 'log';                 // where one log file per download is written
$maxDepth = 5;                   // maximum link depth followed from the start URL
$visitedUrls = [];               // URLs already processed during this request
$mediaExtensions = ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp', 'mp4'];
$webExtensions = ['html', 'htm', 'php', 'asp', 'jsp'];

// Create directories if they don't exist
if (!is_dir($downloadDir)) mkdir($downloadDir, 0777, true);
if (!is_dir($logDir)) mkdir($logDir, 0777, true);

/**
 * Fetches the body of an http(s) URL using a browser-like User-Agent.
 *
 * Only the http and https schemes are accepted. file_get_contents() honours
 * every registered stream wrapper, so without this check a user-supplied or
 * crawled URL such as file:///etc/passwd or php://filter/... would read local
 * resources instead of a remote page.
 *
 * @param string $url Absolute URL to fetch.
 * @return string|false Response body, or false when the scheme is not
 *                      http/https or the request fails.
 */
function fetchUrl($url) {
    // parse_url() yields null/false when there is no scheme; the cast turns both into ''.
    $scheme = strtolower((string) parse_url((string) $url, PHP_URL_SCHEME));
    if ($scheme !== 'http' && $scheme !== 'https') {
        return false;
    }

    $options = [
        "http" => [
            "method" => "GET",
            "header" => "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36\r\n"
        ]
    ];
    $context = stream_context_create($options);

    // Fetching is best-effort: dead links are expected while crawling, so the
    // warning is suppressed and failure is reported through the false return.
    return @file_get_contents($url, false, $context);
}

/**
 * Resolves a (possibly relative) URL reference against an absolute base URL.
 *
 * Handles absolute references, protocol-relative references (//host/path),
 * root-relative and document-relative paths, and query-only / fragment-only
 * references. The base's port and credentials are kept, "." and ".."
 * segments and duplicate slashes in the path are collapsed, and the
 * reference's own query/fragment is passed through untouched.
 *
 * @param string $base Absolute URL of the document the reference was found in.
 * @param string $rel  The reference to resolve.
 * @return string The resolved URL. An empty reference yields $base; if $base
 *                cannot be parsed into scheme and host, $rel is returned as-is.
 */
function resolveUrl($base, $rel) {
    $rel = trim((string) $rel);

    // An empty reference points at the base document itself.
    if ($rel === '') return $base;

    // Already absolute (has its own scheme): nothing to resolve.
    $relScheme = parse_url($rel, PHP_URL_SCHEME);
    if (is_string($relScheme) && $relScheme !== '') return $rel;

    $parts = parse_url($base);
    if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
        // No usable base to resolve against.
        return $rel;
    }

    // Protocol-relative reference: inherits only the scheme of the base.
    if (strncmp($rel, '//', 2) === 0) return $parts['scheme'] . ':' . $rel;

    // Rebuild scheme://[user[:pass]@]host[:port] from the base.
    $root = $parts['scheme'] . '://';
    if (isset($parts['user'])) {
        $root .= $parts['user'];
        if (isset($parts['pass'])) $root .= ':' . $parts['pass'];
        $root .= '@';
    }
    $root .= $parts['host'];
    if (isset($parts['port'])) $root .= ':' . $parts['port'];

    $basePath = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';

    // Fragment-only: keep the base path and query, replace the fragment.
    if ($rel[0] === '#') {
        $query = isset($parts['query']) ? '?' . $parts['query'] : '';
        return $root . $basePath . $query . $rel;
    }

    // Query-only: keep the base path, replace query and fragment.
    if ($rel[0] === '?') {
        return $root . $basePath . $rel;
    }

    if ($rel[0] === '/') {
        // Root-relative: replaces the whole base path.
        $target = $rel;
    } else {
        // Document-relative: replace the last segment of the base path.
        $target = preg_replace('#/[^/]*$#', '', $basePath) . '/' . $rel;
    }

    // Split off the reference's query/fragment so that normalisation only
    // ever touches the path (a "//" inside a query string must survive).
    $cut = strcspn($target, '?#');
    $suffix = (string) substr($target, $cut);
    $path = (string) substr($target, 0, $cut);

    // Collapse empty, "." and ".." segments.
    $segments = [];
    $trailingSlash = false;
    foreach (explode('/', $path) as $segment) {
        if ($segment === '' || $segment === '.') {
            $trailingSlash = true;
            continue;
        }
        if ($segment === '..') {
            array_pop($segments);   // no-op when already at the root
            $trailingSlash = true;
            continue;
        }
        $segments[] = $segment;
        $trailingSlash = false;
    }

    $normalized = '/' . implode('/', $segments);
    if ($trailingSlash && count($segments) > 0) $normalized .= '/';

    return $root . $normalized . $suffix;
}

/**
 * Main Crawl Logic
 *
 * Processes one URL and recurses into the links it contains:
 *  - media files (extension listed in $mediaExtensions) are downloaded to
 *    $downloadDir as "<sha256 of URL>.<ext>" and a log file with the same
 *    hash is written to $logDir;
 *  - URLs with any other extension of three or more characters that is not
 *    listed in $webExtensions are skipped;
 *  - everything else is fetched as a page and every href/src attribute value
 *    is resolved against it and crawled at depth + 1.
 *
 * Progress is echoed as HTML. Reads the globals $mediaExtensions,
 * $webExtensions, $downloadDir and $logDir, and appends to $visitedUrls.
 *
 * @param string $url       Absolute URL to process.
 * @param int    $depth     Depth of $url relative to the start URL (0-based).
 * @param int    $maxDepth  URLs deeper than this are ignored.
 * @param string $signature Free-text marker written into each download log.
 * @return void
 */
function crawl($url, $depth, $maxDepth, $signature) {
    global $visitedUrls, $mediaExtensions, $webExtensions, $downloadDir, $logDir;

    if ($depth > $maxDepth) return;

    // A fragment never reaches the server, so "page#a" and "page#b" are the
    // same resource; track visits without it to avoid re-crawling the page.
    $hashPos = strpos($url, '#');
    $visitKey = ($hashPos === false) ? $url : substr($url, 0, $hashPos);
    if (in_array($visitKey, $visitedUrls, true)) return;
    $visitedUrls[] = $visitKey;

    // $url comes from user input or crawled markup: escape it for HTML output.
    $safeUrl = htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    echo "<li>Processing: <b>$safeUrl</b></li>";

    // Only crawl the web. Other schemes (file:, php:, mailto:, javascript:, ...)
    // are either not fetchable or would expose local resources.
    $scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
    if ($scheme !== 'http' && $scheme !== 'https') {
        echo "<span style='color:gray;'> [Unsupported scheme, Skipped]</span><br>";
        return;
    }

    // parse_url() returns null when the URL has no path; pathinfo() needs a string.
    $ext = strtolower(pathinfo((string) parse_url($url, PHP_URL_PATH), PATHINFO_EXTENSION));

    // Check if it's a Media File
    if (in_array($ext, $mediaExtensions, true)) {
        $hash = hash('sha256', $url);
        $filePath = "$downloadDir/$hash.$ext";

        if (file_exists($filePath)) {
            echo "<span style='color:orange;'> [Exists, Skipped]</span><br>";
            return;
        }

        $content = fetchUrl($url);
        if ($content === false || $content === '') {
            echo "<span style='color:red;'> [Download failed]</span><br>";
            return;
        }

        // Only report success if the file actually reached the disk.
        if (file_put_contents($filePath, $content) === false) {
            echo "<span style='color:red;'> [Could not save file]</span><br>";
            return;
        }

        // Write Log. Line breaks are removed from the signature so it cannot
        // forge additional "Key: value" lines in the log file.
        $cleanSignature = str_replace(["\r", "\n"], ' ', (string) $signature);
        $logData = "URL: $url\nDate: " . date('Y-m-d H:i:s') . "\nSignature: $cleanSignature\nHash: $hash";
        file_put_contents("$logDir/$hash.txt", $logData);

        echo "<span style='color:green;'> [Downloaded]</span><br>";
        return;
    }

    // Skip links whose extension has 3+ characters and is neither media nor a webpage
    if ($ext !== '' && !in_array($ext, $webExtensions, true) && strlen($ext) >= 3) {
        echo "<span style='color:gray;'> [Irrelevant extension, Skipped]</span><br>";
        return;
    }

    // If it's a page, find more links
    $html = fetchUrl($url);
    if ($html === false || $html === '') return;

    // Regex for href and src
    if (!preg_match_all('/(href|src)=["\']([^"\']+)["\']/i', $html, $matches)) return;

    foreach (array_unique($matches[2]) as $foundUrl) {
        // Attribute values are HTML-encoded (e.g. "&amp;" in query strings).
        $foundUrl = trim(html_entity_decode($foundUrl, ENT_QUOTES | ENT_HTML5, 'UTF-8'));
        if ($foundUrl === '') continue;

        $absUrl = resolveUrl($url, $foundUrl);
        if (filter_var($absUrl, FILTER_VALIDATE_URL)) {
            crawl($absUrl, $depth + 1, $maxDepth, $signature);
        }
    }
}
?>

<!DOCTYPE html>
<html>
<head>
    <title>PHP Media Crawler</title>
    <style>
        body { font-family: sans-serif; background: #f4f4f4; padding: 20px; }
        .container { background: #fff; padding: 20px; border-radius: 8px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
        .log-output { background: #222; color: #0f0; padding: 15px; border-radius: 5px; height: 400px; overflow-y: scroll; font-size: 12px; margin-top: 20px; }
    </style>
</head>
<body>

<div class="container">
    <h2>Media Crawler</h2>
    <form method="POST">
        <input type="url" name="url" placeholder="https://example.com" required style="width: 300px;">
        <input type="text" name="signature" placeholder="Optional Signature">
        <button type="submit">Start Crawling</button>
    </form>

    <?php if (isset($_SERVER['REQUEST_METHOD']) && $_SERVER['REQUEST_METHOD'] === 'POST'): ?>
        <div class="log-output">
            <ul>
                <?php
                // Form fields may be missing or submitted as arrays; accept strings only.
                $startUrl = (isset($_POST['url']) && is_string($_POST['url'])) ? trim($_POST['url']) : '';
                $userSig = (isset($_POST['signature']) && is_string($_POST['signature']) && $_POST['signature'] !== '')
                    ? $_POST['signature']
                    : 'No Signature Provided';

                // The type="url" input is only enforced by the browser, so validate
                // again here and accept web URLs only (no file://, php://, ...).
                $startScheme = strtolower((string) parse_url($startUrl, PHP_URL_SCHEME));
                if (filter_var($startUrl, FILTER_VALIDATE_URL) && ($startScheme === 'http' || $startScheme === 'https')) {
                    crawl($startUrl, 0, $maxDepth, $userSig);
                } else {
                    echo "<li>Invalid start URL: please enter an absolute http:// or https:// address.</li>";
                }
                ?>
            </ul>
        </div>
    <?php endif; ?>
</div>

</body>
</html>