#!/usr/bin/env python3
"""
Anno 117 IGN Wiki Scraper
Scrapes pages from IGN's Anno 117 wiki, extracts game data,
discovers new pages, and updates the scraping list.
"""
import argparse
import json
import re
import time
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Project paths
PROJECT_ROOT = Path(__file__).parent.parent
SCRAPING_MD = PROJECT_ROOT / "scraping_ign.md"
PROCESSED_MD = PROJECT_ROOT / "processed.md"
OUTPUT_DIR = PROJECT_ROOT / "scraped_data"

# IGN wiki base URL
IGN_WIKI_BASE = "https://www.ign.com/wikis/anno-117-pax-romana"


def read_scraping_md() -> tuple[list[str], list[str], str]:
    """
    Read scraping_ign.md and return (unchecked_urls, checked_urls, full_content).
    Creates the file with initial URLs if it doesn't exist.
    """
    if not SCRAPING_MD.exists():
        # Create initial scraping_ign.md with the main wiki page
        initial_content = f"""# IGN Anno 117 Scraping URLs
## Pages to Scrape
- [ ] {IGN_WIKI_BASE}
"""
        SCRAPING_MD.write_text(initial_content, encoding="utf-8")
        print(f"Created {SCRAPING_MD} with initial URL")

    content = SCRAPING_MD.read_text(encoding="utf-8")
    unchecked = re.findall(r"^- \[ \] (https?://[^\s]+)", content, re.MULTILINE)
    checked = re.findall(r"^- \[x\] (https?://[^\s]+)", content, re.MULTILINE)
    return unchecked, checked, content
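

# Checkbox line format in scraping_ign.md, as parsed above and rewritten by
# mark_url_as_done() below (illustrated with the base wiki URL):
#   - [ ] https://www.ign.com/wikis/anno-117-pax-romana   (not yet scraped)
#   - [x] https://www.ign.com/wikis/anno-117-pax-romana   (already scraped)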


def mark_url_as_done(url: str) -> None:
    """Mark a URL as scraped in scraping_ign.md."""
    content = SCRAPING_MD.read_text(encoding="utf-8")
    # Use regex to match the exact URL (followed by newline or end of string)
    escaped_url = re.escape(url)
    pattern = rf"^- \[ \] {escaped_url}$"
    replacement = f"- [x] {url}"
    content = re.sub(pattern, replacement, content, flags=re.MULTILINE)
    SCRAPING_MD.write_text(content, encoding="utf-8")
    print(f"Marked as done: {url}")


def add_new_urls(new_urls: list[str]) -> None:
    """Add newly discovered URLs to scraping_ign.md if not already present."""
    unchecked, checked, content = read_scraping_md()
    existing = set(unchecked + checked)
    urls_to_add = [url for url in new_urls if url not in existing]
    if urls_to_add:
        # Append new URLs at the end
        additions = "\n".join(f"- [ ] {url}" for url in urls_to_add)
        content = content.rstrip() + "\n" + additions + "\n"
        SCRAPING_MD.write_text(content, encoding="utf-8")
        print(f"Added {len(urls_to_add)} new URLs to scraping_ign.md")
        for url in urls_to_add:
            print(f" + {url}")


def extract_links(soup: BeautifulSoup, base_url: str) -> list[str]:
    """Extract relevant IGN wiki links from the page."""
    links = []
    for a in soup.find_all("a", href=True):
        href = a["href"]
        full_url = urljoin(base_url, href)
        parsed = urlparse(full_url)
        # Only keep links from IGN's Anno 117 wiki
        if parsed.netloc == "www.ign.com" and "/wikis/anno-117-pax-romana" in parsed.path:
            # Clean the URL (remove fragments and query params for deduplication)
            clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            # Remove trailing slash for consistency
            clean_url = clean_url.rstrip("/")
            if clean_url not in links and clean_url != IGN_WIKI_BASE.rstrip("/"):
                links.append(clean_url)
    return links
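

# Illustrative cleanup performed by extract_links() (hypothetical page slug and query):
#   https://www.ign.com/wikis/anno-117-pax-romana/Buildings?p=2#Comments
#   -> https://www.ign.com/wikis/anno-117-pax-romana/Buildings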


def scrape_page(url: str) -> dict:
    """
    Scrape a single page and extract relevant content.
    Returns a dict with the scraped data.
    """
    print(f"Scraping: {url}")
    headers = {
        "User-Agent": "Anno117DocBot/1.0 (Personal use, one-time scrape for AI documentation; Contact: jivanrij@gmail.com)",
        "X-Bot-Purpose": "Creating documentation for personal AI agent use (non-commercial). Each page is fetched only once.",
        "From": "jivanrij@gmail.com",
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract page title
    title = soup.find("title")
    title_text = title.get_text(strip=True) if title else "Unknown"

    # IGN wiki content is typically in article or wiki-specific containers
    main_content = (
        soup.find("div", class_="wiki-page-content")
        or soup.find("article", class_="article")
        or soup.find("div", class_="content-page")
        or soup.find("main")
        or soup.find("article")
        or soup.find("div", class_="content")
    )

    # Extract all text content
    if main_content:
        text_content = main_content.get_text(separator="\n", strip=True)
    else:
        # Fallback to body
        body = soup.find("body")
        text_content = body.get_text(separator="\n", strip=True) if body else ""

    # Extract tables (common for game data)
    tables = []
    for table in soup.find_all("table"):
        table_data = []
        for row in table.find_all("tr"):
            cells = [cell.get_text(strip=True) for cell in row.find_all(["td", "th"])]
            if cells:
                table_data.append(cells)
        if table_data:
            tables.append(table_data)

    # Extract images (for item/building icons)
    images = []
    for img in soup.find_all("img"):
        src = img.get("src", "")
        alt = img.get("alt", "")
        if src and any(keyword in src.lower() for keyword in ["icon", "item", "building", "good", "anno"]):
            images.append({"src": urljoin(url, src), "alt": alt})

    # Extract discovered links
    discovered_links = extract_links(soup, url)

    # Extract all images (not just filtered ones)
    all_images = []
    for img in soup.find_all("img"):
        src = img.get("src", "")
        alt = img.get("alt", "")
        if src:
            all_images.append({"src": urljoin(url, src), "alt": alt})

    return {
        "url": url,
        "source": "ign",
        "title": title_text,
        "text_content": text_content,
        "tables": tables,
        "images": images,
        "all_images": all_images,
        "discovered_links": discovered_links,
        "full_html_length": len(response.text),
    }


def save_scraped_data(data: dict, url: str) -> Path:
    """Save scraped data to a JSON file."""
    OUTPUT_DIR.mkdir(exist_ok=True)
    # Create filename from URL with ign prefix
    parsed = urlparse(url)
    path_parts = [p for p in parsed.path.split("/") if p]
    # Remove 'wikis' from path parts if present
    path_parts = [p for p in path_parts if p != "wikis"]
    filename = "ign_" + "_".join(path_parts) if path_parts else "ign_index"
    filename = re.sub(r"[^\w\-]", "_", filename)
    output_file = OUTPUT_DIR / f"{filename}.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Saved to: {output_file}")
    return output_file
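

# Illustrative filename mapping in save_scraped_data() (hypothetical page slug "Buildings"):
#   https://www.ign.com/wikis/anno-117-pax-romana/Buildings
#   -> scraped_data/ign_anno-117-pax-romana_Buildings.json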


def add_pending_file(json_filename: str) -> None:
    """Add a JSON file to the pending section of processed.md."""
    # processed.md is not created by this script; skip tracking (rather than crash)
    # if it is missing.
    if not PROCESSED_MD.exists():
        print(f"Warning: {PROCESSED_MD} not found; skipping pending tracking")
        return
    content = PROCESSED_MD.read_text(encoding="utf-8")
    # Check if file is already listed
    if json_filename in content:
        return
    # Add to pending section
    pending_marker = "## Pending Files"
    if pending_marker in content:
        content = content.replace(
            pending_marker,
            f"{pending_marker}\n- [ ] {json_filename}"
        )
        PROCESSED_MD.write_text(content, encoding="utf-8")
        print(f"Added to processed.md pending: {json_filename}")


def scrape_one(url: str) -> bool:
    """Scrape a single URL. Returns True on success, False on failure."""
    try:
        # Scrape the page
        data = scrape_page(url)
        # Save the data
        output_file = save_scraped_data(data, url)
        # Track in processed.md as pending
        add_pending_file(output_file.name)

        # Print summary
        print("\n--- Summary ---")
        print(f"Title: {data['title']}")
        print(f"Content length: {len(data['text_content'])} chars")
        print(f"Tables found: {len(data['tables'])}")
        print(f"Images found: {len(data['images'])}")
        print(f"Links discovered: {len(data['discovered_links'])}")

        # Add discovered links to scraping_ign.md
        if data["discovered_links"]:
            add_new_urls(data["discovered_links"])

        # Mark this URL as done
        mark_url_as_done(url)
        print(f"\nSuccessfully scraped: {url}")
        return True
    except requests.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return False


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description="Scrape Anno 117 pages from IGN wiki")
    parser.add_argument(
        "-n", "--count",
        type=int,
        default=1,
        help="Number of URLs to scrape (default: 1)"
    )
    args = parser.parse_args()

    unchecked, checked, _ = read_scraping_md()
    if not unchecked:
        print("No unchecked URLs found in scraping_ign.md")
        return

    print(f"Found {len(unchecked)} unchecked URLs")
    print(f"Already scraped: {len(checked)} URLs")
    print(f"Will scrape: {min(args.count, len(unchecked))} URLs\n")

    scraped = 0
    failed = 0
    urls_to_scrape = unchecked[:args.count]
    total = len(urls_to_scrape)

    for i, url in enumerate(urls_to_scrape):
        print(f"[{i + 1}/{total}] ", end="")
        if scrape_one(url):
            scraped += 1
        else:
            failed += 1
        print()

        # Be nice to the server - wait 3 seconds between requests
        if i < total - 1:
            print("Waiting 3 seconds...")
            time.sleep(3)

    print(f"Done! Scraped: {scraped}, Failed: {failed}")


if __name__ == "__main__":
    main()