#!/usr/bin/env python3
"""
Anno 117 IGN Wiki Scraper

Scrapes pages from IGN's Anno 117 wiki, extracts game data,
discovers new pages, and updates the scraping list.
"""

import argparse
import re
import json
import time
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Project paths
PROJECT_ROOT = Path(__file__).parent.parent
SCRAPING_MD = PROJECT_ROOT / "scraping_ign.md"
PROCESSED_MD = PROJECT_ROOT / "processed.md"
OUTPUT_DIR = PROJECT_ROOT / "scraped_data"

# IGN wiki base URL
IGN_WIKI_BASE = "https://www.ign.com/wikis/anno-117-pax-romana"

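
# scraping_ign.md is a Markdown checklist: "- [ ] <url>" marks a page still to be
# scraped and "- [x] <url>" marks a page already scraped, e.g.
#
#   - [ ] https://www.ign.com/wikis/anno-117-pax-romana/Buildings
#   - [x] https://www.ign.com/wikis/anno-117-pax-romana
#
# (the "Buildings" page name above is purely illustrative).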
def read_scraping_md() -> tuple[list[str], list[str], str]:
    """
    Read scraping_ign.md and return (unchecked_urls, checked_urls, full_content).
    Creates the file with initial URLs if it doesn't exist.
    """
    if not SCRAPING_MD.exists():
        # Create initial scraping_ign.md with the main wiki page
        initial_content = f"""# IGN Anno 117 Scraping URLs

## Pages to Scrape

- [ ] {IGN_WIKI_BASE}
"""
        SCRAPING_MD.write_text(initial_content, encoding="utf-8")
        print(f"Created {SCRAPING_MD} with initial URL")

    content = SCRAPING_MD.read_text(encoding="utf-8")

    unchecked = re.findall(r"^- \[ \] (https?://[^\s]+)", content, re.MULTILINE)
    checked = re.findall(r"^- \[x\] (https?://[^\s]+)", content, re.MULTILINE)

    return unchecked, checked, content


def mark_url_as_done(url: str) -> None:
    """Mark a URL as scraped in scraping_ign.md."""
    content = SCRAPING_MD.read_text(encoding="utf-8")

    # Use regex to match the exact URL (followed by newline or end of string)
    escaped_url = re.escape(url)
    pattern = rf"^- \[ \] {escaped_url}$"
    replacement = f"- [x] {url}"

    content = re.sub(pattern, replacement, content, flags=re.MULTILINE)
    SCRAPING_MD.write_text(content, encoding="utf-8")
    print(f"Marked as done: {url}")


def add_new_urls(new_urls: list[str]) -> None:
    """Add newly discovered URLs to scraping_ign.md if not already present."""
    unchecked, checked, content = read_scraping_md()
    existing = set(unchecked + checked)

    urls_to_add = [url for url in new_urls if url not in existing]

    if urls_to_add:
        # Append new URLs at the end
        additions = "\n".join(f"- [ ] {url}" for url in urls_to_add)
        content = content.rstrip() + "\n" + additions + "\n"
        SCRAPING_MD.write_text(content, encoding="utf-8")
        print(f"Added {len(urls_to_add)} new URLs to scraping_ign.md")
        for url in urls_to_add:
            print(f"  + {url}")


def extract_links(soup: BeautifulSoup, base_url: str) -> list[str]:
    """Extract relevant IGN wiki links from the page."""
    links = []

    for a in soup.find_all("a", href=True):
        href = a["href"]
        full_url = urljoin(base_url, href)
        parsed = urlparse(full_url)

        # Only keep links from IGN's Anno 117 wiki
        if parsed.netloc == "www.ign.com" and "/wikis/anno-117-pax-romana" in parsed.path:
            # Clean the URL (remove fragments and query params for deduplication)
            clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            # Remove trailing slash for consistency
            clean_url = clean_url.rstrip("/")

            if clean_url not in links and clean_url != IGN_WIKI_BASE.rstrip("/"):
                links.append(clean_url)

    return links


def scrape_page(url: str) -> dict:
    """
    Scrape a single page and extract relevant content.
    Returns a dict with the scraped data.
    """
    print(f"Scraping: {url}")

    headers = {
        "User-Agent": "Anno117DocBot/1.0 (Personal use, one-time scrape for AI documentation; Contact: jivanrij@gmail.com)",
        "X-Bot-Purpose": "Creating documentation for personal AI agent use (non-commercial). Each page is fetched only once.",
        "From": "jivanrij@gmail.com",
    }

    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Extract page title
    title = soup.find("title")
    title_text = title.get_text(strip=True) if title else "Unknown"

    # IGN wiki content is typically in article or wiki-specific containers
    main_content = (
        soup.find("div", class_="wiki-page-content")
        or soup.find("article", class_="article")
        or soup.find("div", class_="content-page")
        or soup.find("main")
        or soup.find("article")
        or soup.find("div", class_="content")
    )

    # Extract all text content
    if main_content:
        text_content = main_content.get_text(separator="\n", strip=True)
    else:
        # Fallback to body
        body = soup.find("body")
        text_content = body.get_text(separator="\n", strip=True) if body else ""

    # Extract tables (common for game data)
    tables = []
    for table in soup.find_all("table"):
        table_data = []
        for row in table.find_all("tr"):
            cells = [cell.get_text(strip=True) for cell in row.find_all(["td", "th"])]
            if cells:
                table_data.append(cells)
        if table_data:
            tables.append(table_data)

    # Extract images (for item/building icons)
    images = []
    for img in soup.find_all("img"):
        src = img.get("src", "")
        alt = img.get("alt", "")
        if src and any(keyword in src.lower() for keyword in ["icon", "item", "building", "good", "anno"]):
            images.append({"src": urljoin(url, src), "alt": alt})

    # Extract discovered links
    discovered_links = extract_links(soup, url)

    # Extract all images (not just filtered ones)
    all_images = []
    for img in soup.find_all("img"):
        src = img.get("src", "")
        alt = img.get("alt", "")
        if src:
            all_images.append({"src": urljoin(url, src), "alt": alt})

    return {
        "url": url,
        "source": "ign",
        "title": title_text,
        "text_content": text_content,
        "tables": tables,
        "images": images,
        "all_images": all_images,
        "discovered_links": discovered_links,
        "full_html_length": len(response.text)
    }


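
# Example of the filename derivation in save_scraped_data() below, using a
# hypothetical page URL for illustration:
#   https://www.ign.com/wikis/anno-117-pax-romana/Buildings
#   -> scraped_data/ign_anno-117-pax-romana_Buildings.json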
def save_scraped_data(data: dict, url: str) -> Path:
    """Save scraped data to a JSON file."""
    OUTPUT_DIR.mkdir(exist_ok=True)

    # Create filename from URL with ign prefix
    parsed = urlparse(url)
    path_parts = [p for p in parsed.path.split("/") if p]
    # Remove 'wikis' from path parts if present
    path_parts = [p for p in path_parts if p != "wikis"]
    filename = "ign_" + "_".join(path_parts) if path_parts else "ign_index"
    filename = re.sub(r"[^\w\-]", "_", filename)

    output_file = OUTPUT_DIR / f"{filename}.json"

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"Saved to: {output_file}")
    return output_file


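# Note: processed.md is assumed to already exist and to contain a
# "## Pending Files" heading; if the heading is missing, the new filename is
# silently skipped rather than appended.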
def add_pending_file(json_filename: str) -> None:
    """Add a JSON file to the pending section of processed.md."""
    content = PROCESSED_MD.read_text(encoding="utf-8")

    # Check if file is already listed
    if json_filename in content:
        return

    # Add to pending section
    pending_marker = "## Pending Files"
    if pending_marker in content:
        content = content.replace(
            pending_marker,
            f"{pending_marker}\n- [ ] {json_filename}"
        )
        PROCESSED_MD.write_text(content, encoding="utf-8")
        print(f"Added to processed.md pending: {json_filename}")


def scrape_one(url: str) -> bool:
    """Scrape a single URL. Returns True on success, False on failure."""
    try:
        # Scrape the page
        data = scrape_page(url)

        # Save the data
        output_file = save_scraped_data(data, url)

        # Track in processed.md as pending
        add_pending_file(output_file.name)

        # Print summary
        print("\n--- Summary ---")
        print(f"Title: {data['title']}")
        print(f"Content length: {len(data['text_content'])} chars")
        print(f"Tables found: {len(data['tables'])}")
        print(f"Images found: {len(data['images'])}")
        print(f"Links discovered: {len(data['discovered_links'])}")

        # Add discovered links to scraping_ign.md
        if data["discovered_links"]:
            add_new_urls(data["discovered_links"])

        # Mark this URL as done
        mark_url_as_done(url)

        print(f"\nSuccessfully scraped: {url}")
        return True

    except requests.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return False


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description="Scrape Anno 117 pages from IGN wiki")
    parser.add_argument(
        "-n", "--count",
        type=int,
        default=1,
        help="Number of URLs to scrape (default: 1)"
    )
    args = parser.parse_args()

    unchecked, checked, _ = read_scraping_md()

    if not unchecked:
        print("No unchecked URLs found in scraping_ign.md")
        return

    print(f"Found {len(unchecked)} unchecked URLs")
    print(f"Already scraped: {len(checked)} URLs")
    print(f"Will scrape: {min(args.count, len(unchecked))} URLs\n")

    scraped = 0
    failed = 0

    urls_to_scrape = unchecked[:args.count]
    total = len(urls_to_scrape)

    for i, url in enumerate(urls_to_scrape):
        print(f"[{i + 1}/{total}] ", end="")
        if scrape_one(url):
            scraped += 1
        else:
            failed += 1
        print()

        # Be nice to the server - wait 3 seconds between requests
        if i < total - 1:
            print("Waiting 3 seconds...")
            time.sleep(3)

    print(f"Done! Scraped: {scraped}, Failed: {failed}")


if __name__ == "__main__":
    main()
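
# Example usage (the script filename is assumed; adjust it to match this file's
# actual name in the repository):
#   ./scrape_ign.py          # scrape the next unchecked URL from scraping_ign.md
#   ./scrape_ign.py -n 5     # scrape up to five unchecked URLs, pausing 3 s between requests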