adding data

This commit is contained in:
2025-12-30 14:50:25 +01:00
parent 30a2a13d20
commit df2b557585
4 changed files with 841 additions and 78 deletions

View File

@@ -6,8 +6,10 @@ Scrapes pages listed in scraping.md, extracts game data,
discovers new pages, and updates the scraping list.
"""
import argparse
import re
import json
import time
from pathlib import Path
from urllib.parse import urljoin, urlparse
@@ -206,20 +208,8 @@ def add_pending_file(json_filename: str) -> None:
print(f"Added to processed.md pending: {json_filename}")
def main():
"""Main entry point - scrape one unchecked page."""
unchecked, checked, _ = read_scraping_md()
if not unchecked:
print("No unchecked URLs found in scraping.md")
return
print(f"Found {len(unchecked)} unchecked URLs")
print(f"Already scraped: {len(checked)} URLs")
# Take the first unchecked URL
url = unchecked[0]
def scrape_one(url: str) -> bool:
"""Scrape a single URL. Returns True on success, False on failure."""
try:
# Scrape the page
data = scrape_page(url)
@@ -246,10 +236,54 @@ def main():
mark_url_as_done(url)
print(f"\nSuccessfully scraped: {url}")
return True
except requests.RequestException as e:
print(f"Error scraping {url}: {e}")
raise
return False
def main(argv=None):
    """Scrape the next unchecked URLs listed in scraping.md.

    Parses ``-n/--count`` (default 1), takes that many URLs from the front
    of the unchecked list returned by ``read_scraping_md()`` and passes each
    one to ``scrape_one()``, pausing between consecutive requests. A progress
    prefix is printed per URL and a scraped/failed tally at the end.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 2 when ``--count`` is smaller than 1 or the
            arguments cannot be parsed.
    """
    parser = argparse.ArgumentParser(description="Scrape Anno 117 pages from scraping.md")
    parser.add_argument(
        "-n", "--count",
        type=int,
        default=1,
        help="Number of URLs to scrape (default: 1)"
    )
    args = parser.parse_args(argv)

    # Zero would silently do nothing, and a negative count would turn the
    # slice below into "everything except the last |n| URLs" rather than a
    # limit, so reject both up front.
    if args.count < 1:
        parser.error("--count must be a positive integer")

    unchecked, checked, _ = read_scraping_md()

    if not unchecked:
        print("No unchecked URLs found in scraping.md")
        return

    urls_to_scrape = unchecked[:args.count]
    total = len(urls_to_scrape)

    print(f"Found {len(unchecked)} unchecked URLs")
    print(f"Already scraped: {len(checked)} URLs")
    print(f"Will scrape: {total} URLs\n")

    delay_seconds = 3
    scraped = 0
    failed = 0

    for position, url in enumerate(urls_to_scrape, start=1):
        # The prefix shares a line with whatever scrape_one prints first.
        print(f"[{position}/{total}] ", end="")

        if scrape_one(url):
            scraped += 1
        else:
            failed += 1

        print()

        # Be nice to the server: pause between requests, but not after the
        # last one.
        if position < total:
            print(f"Waiting {delay_seconds} seconds...")
            time.sleep(delay_seconds)

    print(f"Done! Scraped: {scraped}, Failed: {failed}")
if __name__ == "__main__":