adding data

2025-12-30 14:34:34 +01:00
parent 5f1b2d0474
commit 30a2a13d20
8 changed files with 1454 additions and 0 deletions

python/scraper.py (new file, 256 lines)

@@ -0,0 +1,256 @@
#!/usr/bin/env python3
"""
Anno 117 Web Scraper
Scrapes pages listed in scraping.md, extracts game data,
discovers new pages, and updates the scraping list.
"""
import re
import json
from pathlib import Path
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
# Project paths
PROJECT_ROOT = Path(__file__).parent.parent
SCRAPING_MD = PROJECT_ROOT / "scraping.md"
PROCESSED_MD = PROJECT_ROOT / "processed.md"
OUTPUT_DIR = PROJECT_ROOT / "scraped_data"
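# The helpers below assume scraping.md is a Markdown checklist of URLs, roughly
# of this shape (illustrative sketch, not the actual file contents):
#
#   # Scraping queue
#   - [ ] https://anno.land/anno-117/goods/
#   - [x] https://anno.land/anno-117/buildings/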
def read_scraping_md() -> tuple[list[str], list[str], str]:
    """
    Read scraping.md and return (unchecked_urls, checked_urls, full_content).
    """
    content = SCRAPING_MD.read_text(encoding="utf-8")
    unchecked = re.findall(r"^- \[ \] (https?://[^\s]+)", content, re.MULTILINE)
    checked = re.findall(r"^- \[x\] (https?://[^\s]+)", content, re.MULTILINE)
    return unchecked, checked, content


def mark_url_as_done(url: str) -> None:
    """Mark a URL as scraped in scraping.md."""
    content = SCRAPING_MD.read_text(encoding="utf-8")
    # Use regex to match the exact URL (followed by newline or end of string)
    # This prevents partial URL matches
    escaped_url = re.escape(url)
    pattern = rf"^- \[ \] {escaped_url}$"
    replacement = f"- [x] {url}"
    content = re.sub(pattern, replacement, content, flags=re.MULTILINE)
    SCRAPING_MD.write_text(content, encoding="utf-8")
    print(f"Marked as done: {url}")
def add_new_urls(new_urls: list[str]) -> None:
    """Add newly discovered URLs to scraping.md if not already present."""
    unchecked, checked, content = read_scraping_md()
    existing = set(unchecked + checked)
    urls_to_add = [url for url in new_urls if url not in existing]
    if urls_to_add:
        # Append new URLs at the end
        additions = "\n".join(f"- [ ] {url}" for url in urls_to_add)
        content = content.rstrip() + "\n" + additions + "\n"
        SCRAPING_MD.write_text(content, encoding="utf-8")
        print(f"Added {len(urls_to_add)} new URLs to scraping.md")
        for url in urls_to_add:
            print(f" + {url}")
def extract_links(soup: BeautifulSoup, base_url: str) -> list[str]:
    """Extract relevant anno.land links from the page."""
    links = []
    base_domain = urlparse(base_url).netloc
    for a in soup.find_all("a", href=True):
        href = a["href"]
        full_url = urljoin(base_url, href)
        parsed = urlparse(full_url)
        # Only keep links from the same domain and relevant paths
        if parsed.netloc == base_domain:
            # Filter for Anno 117 related content
            if any(keyword in parsed.path for keyword in [
                "anno-117", "goods", "buildings", "specialists",
                "skills", "guides", "tools", "datenbank"
            ]):
                # Clean the URL (remove fragments and query params for deduplication)
                # and normalize the trailing slash, except for .html/.php pages
                clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
                if not parsed.path.endswith((".html", ".php")):
                    clean_url = clean_url.rstrip("/") + "/"
                if clean_url not in links:
                    links.append(clean_url)
    return links
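# Trailing-slash normalization in extract_links, by example (hypothetical URLs):
#   https://anno.land/anno-117/goods    ->  https://anno.land/anno-117/goods/
#   https://anno.land/anno-117/goods/   ->  https://anno.land/anno-117/goods/
#   https://anno.land/tools/calculator.html stays unchanged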
def scrape_page(url: str) -> dict:
    """
    Scrape a single page and extract relevant content.
    Returns a dict with the scraped data.
    """
    print(f"Scraping: {url}")
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract page title
    title = soup.find("title")
    title_text = title.get_text(strip=True) if title else "Unknown"

    # Extract main content (adjust selectors based on site structure)
    main_content = soup.find("main") or soup.find("article") or soup.find("div", class_="content")

    # Extract all text content
    if main_content:
        text_content = main_content.get_text(separator="\n", strip=True)
    else:
        # Fallback to body
        body = soup.find("body")
        text_content = body.get_text(separator="\n", strip=True) if body else ""

    # Extract tables (common for game data)
    tables = []
    for table in soup.find_all("table"):
        table_data = []
        for row in table.find_all("tr"):
            cells = [cell.get_text(strip=True) for cell in row.find_all(["td", "th"])]
            if cells:
                table_data.append(cells)
        if table_data:
            tables.append(table_data)

    # Extract images (for item/building icons)
    images = []
    for img in soup.find_all("img"):
        src = img.get("src", "")
        alt = img.get("alt", "")
        if src and any(keyword in src.lower() for keyword in ["icon", "item", "building", "good"]):
            images.append({"src": urljoin(url, src), "alt": alt})

    # Extract discovered links
    discovered_links = extract_links(soup, url)

    # Extract all images (not just filtered ones)
    all_images = []
    for img in soup.find_all("img"):
        src = img.get("src", "")
        alt = img.get("alt", "")
        if src:
            all_images.append({"src": urljoin(url, src), "alt": alt})

    return {
        "url": url,
        "title": title_text,
        "text_content": text_content,  # Full content for Claude to process
        "tables": tables,
        "images": images,  # Filtered icons
        "all_images": all_images,  # All images on page
        "discovered_links": discovered_links,
        "full_html_length": len(response.text),
    }
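# Sketch of the "tables" shape scrape_page returns (hypothetical values): a
# single two-column goods table on the page would come back as
#   [[["Good", "Production time"], ["Grain", "30s"], ["Bread", "60s"]]]
# i.e. a list of tables, each a list of rows, each a list of cell strings.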
def save_scraped_data(data: dict, url: str) -> Path:
    """Save scraped data to a JSON file."""
    OUTPUT_DIR.mkdir(exist_ok=True)
    # Create filename from URL
    parsed = urlparse(url)
    path_parts = [p for p in parsed.path.split("/") if p]
    filename = "_".join(path_parts) if path_parts else "index"
    filename = re.sub(r"[^\w\-]", "_", filename)
    output_file = OUTPUT_DIR / f"{filename}.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Saved to: {output_file}")
    return output_file
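# Filename derivation in save_scraped_data, by example (hypothetical URLs):
#   https://anno.land/anno-117/goods/  ->  scraped_data/anno-117_goods.json
#   https://anno.land/                 ->  scraped_data/index.json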
def add_pending_file(json_filename: str) -> None:
    """Add a JSON file to the pending section of processed.md."""
    content = PROCESSED_MD.read_text(encoding="utf-8")
    # Check if file is already listed
    if json_filename in content:
        return
    # Add to pending section
    pending_marker = "## Pending Files"
    if pending_marker in content:
        content = content.replace(
            pending_marker,
            f"{pending_marker}\n- [ ] {json_filename}"
        )
        PROCESSED_MD.write_text(content, encoding="utf-8")
        print(f"Added to processed.md pending: {json_filename}")
def main():
    """Main entry point - scrape one unchecked page."""
    unchecked, checked, _ = read_scraping_md()
    if not unchecked:
        print("No unchecked URLs found in scraping.md")
        return
    print(f"Found {len(unchecked)} unchecked URLs")
    print(f"Already scraped: {len(checked)} URLs")

    # Take the first unchecked URL
    url = unchecked[0]
    try:
        # Scrape the page
        data = scrape_page(url)

        # Save the data
        output_file = save_scraped_data(data, url)

        # Track in processed.md as pending
        add_pending_file(output_file.name)

        # Print summary
        print("\n--- Summary ---")
        print(f"Title: {data['title']}")
        print(f"Content length: {len(data['text_content'])} chars")
        print(f"Tables found: {len(data['tables'])}")
        print(f"Images found: {len(data['images'])}")
        print(f"Links discovered: {len(data['discovered_links'])}")

        # Add discovered links to scraping.md
        if data["discovered_links"]:
            add_new_urls(data["discovered_links"])

        # Mark this URL as done
        mark_url_as_done(url)
        print(f"\nSuccessfully scraped: {url}")
    except requests.RequestException as e:
        print(f"Error scraping {url}: {e}")
        raise
if __name__ == "__main__":
    main()
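# Typical invocation from the project root (processes exactly one unchecked
# URL per run; re-run to work through the queue):
#   python python/scraper.py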