adding data
@@ -6,8 +6,10 @@ Scrapes pages listed in scraping.md, extracts game data,
 discovers new pages, and updates the scraping list.
 """
 
+import argparse
 import re
 import json
+import time
 from pathlib import Path
 from urllib.parse import urljoin, urlparse
 
@@ -206,20 +208,8 @@ def add_pending_file(json_filename: str) -> None:
     print(f"Added to processed.md pending: {json_filename}")
 
 
-def main():
-    """Main entry point - scrape one unchecked page."""
-    unchecked, checked, _ = read_scraping_md()
-
-    if not unchecked:
-        print("No unchecked URLs found in scraping.md")
-        return
-
-    print(f"Found {len(unchecked)} unchecked URLs")
-    print(f"Already scraped: {len(checked)} URLs")
-
-    # Take the first unchecked URL
-    url = unchecked[0]
-
+def scrape_one(url: str) -> bool:
+    """Scrape a single URL. Returns True on success, False on failure."""
     try:
         # Scrape the page
         data = scrape_page(url)
@@ -246,10 +236,54 @@ def main():
         mark_url_as_done(url)
 
         print(f"\nSuccessfully scraped: {url}")
+        return True
 
     except requests.RequestException as e:
         print(f"Error scraping {url}: {e}")
-        raise
+        return False
 
 
+def main():
+    """Main entry point."""
+    parser = argparse.ArgumentParser(description="Scrape Anno 117 pages from scraping.md")
+    parser.add_argument(
+        "-n", "--count",
+        type=int,
+        default=1,
+        help="Number of URLs to scrape (default: 1)"
+    )
+    args = parser.parse_args()
+
+    unchecked, checked, _ = read_scraping_md()
+
+    if not unchecked:
+        print("No unchecked URLs found in scraping.md")
+        return
+
+    print(f"Found {len(unchecked)} unchecked URLs")
+    print(f"Already scraped: {len(checked)} URLs")
+    print(f"Will scrape: {min(args.count, len(unchecked))} URLs\n")
+
+    scraped = 0
+    failed = 0
+
+    urls_to_scrape = unchecked[:args.count]
+    total = len(urls_to_scrape)
+
+    for i, url in enumerate(urls_to_scrape):
+        print(f"[{i + 1}/{total}] ", end="")
+        if scrape_one(url):
+            scraped += 1
+        else:
+            failed += 1
+        print()
+
+        # Be nice to the server - wait 3 seconds between requests
+        if i < total - 1:
+            print("Waiting 3 seconds...")
+            time.sleep(3)
+
+    print(f"Done! Scraped: {scraped}, Failed: {failed}")
+
+
 if __name__ == "__main__":
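
Two parts of this commit fit together: scrape_one() now returns False on a failed request instead of re-raising, and main() gained a -n/--count flag. Returning instead of raising is what lets a batch run survive one bad page: the loop counts the failure and moves on to the next unchecked URL. Assuming the module is run directly (its filename is not shown in this diff, so scrape.py here is a placeholder), a batch run would look like python scrape.py -n 5, which scrapes up to five unchecked URLs with a 3-second pause between requests.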
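
read_scraping_md, scrape_page, and mark_url_as_done are defined elsewhere in the file and do not appear in this diff. For orientation only, here is a minimal sketch of the two checklist helpers, assuming scraping.md tracks URLs as markdown task-list items (- [ ] url pending, - [x] url done) and that the third return value is the raw file text; the real implementations may differ:

import re
from pathlib import Path

SCRAPING_MD = Path("scraping.md")  # filename assumed from the module docstring


def read_scraping_md() -> tuple[list[str], list[str], str]:
    """Return (unchecked URLs, checked URLs, raw text) from scraping.md."""
    text = SCRAPING_MD.read_text(encoding="utf-8")
    unchecked = re.findall(r"^- \[ \] (\S+)", text, flags=re.MULTILINE)
    checked = re.findall(r"^- \[x\] (\S+)", text, flags=re.MULTILINE)
    return unchecked, checked, text


def mark_url_as_done(url: str) -> None:
    """Flip a URL's entry from '- [ ]' to '- [x]' in scraping.md."""
    text = SCRAPING_MD.read_text(encoding="utf-8")
    text = text.replace(f"- [ ] {url}", f"- [x] {url}", 1)
    SCRAPING_MD.write_text(text, encoding="utf-8")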