adding data
python/scraper.py | 256 | Normal file
@@ -0,0 +1,256 @@
#!/usr/bin/env python3
"""
Anno 117 Web Scraper

Scrapes pages listed in scraping.md, extracts game data,
discovers new pages, and updates the scraping list.
"""

import re
import json
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Project paths
PROJECT_ROOT = Path(__file__).parent.parent
SCRAPING_MD = PROJECT_ROOT / "scraping.md"
PROCESSED_MD = PROJECT_ROOT / "processed.md"
OUTPUT_DIR = PROJECT_ROOT / "scraped_data"
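
# scraping.md is expected to be a Markdown task list of URLs, one entry per line.
# Illustrative example (these URLs are placeholders, not taken from the repo):
#   - [ ] https://anno.land/anno-117/goods/
#   - [x] https://anno.land/anno-117/buildings/
# "- [ ]" entries are still pending; "- [x]" entries have already been scraped.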


def read_scraping_md() -> tuple[list[str], list[str], str]:
    """
    Read scraping.md and return (unchecked_urls, checked_urls, full_content).
    """
    content = SCRAPING_MD.read_text(encoding="utf-8")

    unchecked = re.findall(r"^- \[ \] (https?://[^\s]+)", content, re.MULTILINE)
    checked = re.findall(r"^- \[x\] (https?://[^\s]+)", content, re.MULTILINE)

    return unchecked, checked, content


def mark_url_as_done(url: str) -> None:
    """Mark a URL as scraped in scraping.md."""
    content = SCRAPING_MD.read_text(encoding="utf-8")

    # Use regex to match the exact URL (followed by newline or end of string)
    # This prevents partial URL matches
    escaped_url = re.escape(url)
    pattern = rf"^- \[ \] {escaped_url}$"
    replacement = f"- [x] {url}"
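    # Illustrative effect of the substitution below (URL is a placeholder):
    #   "- [ ] https://anno.land/anno-117/goods/"  ->  "- [x] https://anno.land/anno-117/goods/"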

    content = re.sub(pattern, replacement, content, flags=re.MULTILINE)
    SCRAPING_MD.write_text(content, encoding="utf-8")
    print(f"Marked as done: {url}")


def add_new_urls(new_urls: list[str]) -> None:
    """Add newly discovered URLs to scraping.md if not already present."""
    unchecked, checked, content = read_scraping_md()
    existing = set(unchecked + checked)

    urls_to_add = [url for url in new_urls if url not in existing]

    if urls_to_add:
        # Append new URLs at the end
        additions = "\n".join(f"- [ ] {url}" for url in urls_to_add)
        content = content.rstrip() + "\n" + additions + "\n"
        SCRAPING_MD.write_text(content, encoding="utf-8")
        print(f"Added {len(urls_to_add)} new URLs to scraping.md")
        for url in urls_to_add:
            print(f" + {url}")


def extract_links(soup: BeautifulSoup, base_url: str) -> list[str]:
    """Extract relevant anno.land links from the page."""
    links = []
    base_domain = urlparse(base_url).netloc

    for a in soup.find_all("a", href=True):
        href = a["href"]
        full_url = urljoin(base_url, href)
        parsed = urlparse(full_url)

        # Only keep links from the same domain and relevant paths
        if parsed.netloc == base_domain:
            # Filter for Anno 117 related content
            if any(keyword in parsed.path for keyword in [
                "anno-117", "goods", "buildings", "specialists",
                "skills", "guides", "tools", "datenbank"
            ]):
                # Clean the URL (remove fragments and query params for deduplication)
                clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
                if clean_url.endswith("/"):
                    clean_url = clean_url.rstrip("/") + "/"
                elif not parsed.path.endswith((".html", ".php")):
                    clean_url = clean_url + "/"
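                # Illustrative normalisation (placeholder URL):
                #   https://anno.land/anno-117/goods?page=2#top  ->  https://anno.land/anno-117/goods/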

                if clean_url not in links:
                    links.append(clean_url)

    return links


def scrape_page(url: str) -> dict:
    """
    Scrape a single page and extract relevant content.
    Returns a dict with the scraped data.
    """
    print(f"Scraping: {url}")

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
    }

    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Extract page title
    title = soup.find("title")
    title_text = title.get_text(strip=True) if title else "Unknown"

    # Extract main content (adjust selectors based on site structure)
    main_content = soup.find("main") or soup.find("article") or soup.find("div", class_="content")

    # Extract all text content
    if main_content:
        text_content = main_content.get_text(separator="\n", strip=True)
    else:
        # Fallback to body
        body = soup.find("body")
        text_content = body.get_text(separator="\n", strip=True) if body else ""

    # Extract tables (common for game data)
    tables = []
    for table in soup.find_all("table"):
        table_data = []
        for row in table.find_all("tr"):
            cells = [cell.get_text(strip=True) for cell in row.find_all(["td", "th"])]
            if cells:
                table_data.append(cells)
        if table_data:
            tables.append(table_data)

    # Extract images (for item/building icons)
    images = []
    for img in soup.find_all("img"):
        src = img.get("src", "")
        alt = img.get("alt", "")
        if src and any(keyword in src.lower() for keyword in ["icon", "item", "building", "good"]):
            images.append({"src": urljoin(url, src), "alt": alt})

    # Extract discovered links
    discovered_links = extract_links(soup, url)

    # Extract all images (not just filtered ones)
    all_images = []
    for img in soup.find_all("img"):
        src = img.get("src", "")
        alt = img.get("alt", "")
        if src:
            all_images.append({"src": urljoin(url, src), "alt": alt})

    return {
        "url": url,
        "title": title_text,
        "text_content": text_content,  # Full content for Claude to process
        "tables": tables,
        "images": images,  # Filtered icons
        "all_images": all_images,  # All images on page
        "discovered_links": discovered_links,
        "full_html_length": len(response.text)
    }


def save_scraped_data(data: dict, url: str) -> Path:
    """Save scraped data to a JSON file."""
    OUTPUT_DIR.mkdir(exist_ok=True)

    # Create filename from URL
    parsed = urlparse(url)
    path_parts = [p for p in parsed.path.split("/") if p]
    filename = "_".join(path_parts) if path_parts else "index"
    filename = re.sub(r"[^\w\-]", "_", filename)
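    # Illustrative example of the derived filename (URL is a placeholder):
    #   https://anno.land/anno-117/goods/  ->  scraped_data/anno-117_goods.json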

    output_file = OUTPUT_DIR / f"{filename}.json"

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"Saved to: {output_file}")
    return output_file


def add_pending_file(json_filename: str) -> None:
    """Add a JSON file to the pending section of processed.md."""
    content = PROCESSED_MD.read_text(encoding="utf-8")

    # Check if file is already listed
    if json_filename in content:
        return

    # Add to pending section
    pending_marker = "## Pending Files"
    if pending_marker in content:
        content = content.replace(
            pending_marker,
            f"{pending_marker}\n- [ ] {json_filename}"
        )
        PROCESSED_MD.write_text(content, encoding="utf-8")
        print(f"Added to processed.md pending: {json_filename}")


def main():
    """Main entry point - scrape one unchecked page."""
    unchecked, checked, _ = read_scraping_md()

    if not unchecked:
        print("No unchecked URLs found in scraping.md")
        return

    print(f"Found {len(unchecked)} unchecked URLs")
    print(f"Already scraped: {len(checked)} URLs")

    # Take the first unchecked URL
    url = unchecked[0]

    try:
        # Scrape the page
        data = scrape_page(url)

        # Save the data
        output_file = save_scraped_data(data, url)

        # Track in processed.md as pending
        add_pending_file(output_file.name)

        # Print summary
        print(f"\n--- Summary ---")
        print(f"Title: {data['title']}")
        print(f"Content length: {len(data['text_content'])} chars")
        print(f"Tables found: {len(data['tables'])}")
        print(f"Images found: {len(data['images'])}")
        print(f"Links discovered: {len(data['discovered_links'])}")

        # Add discovered links to scraping.md
        if data["discovered_links"]:
            add_new_urls(data["discovered_links"])

        # Mark this URL as done
        mark_url_as_done(url)

        print(f"\nSuccessfully scraped: {url}")

    except requests.RequestException as e:
        print(f"Error scraping {url}: {e}")
        raise
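

# One pending URL is scraped per run. Illustrative invocation from the project
# root (path assumed from the repo layout shown in the diff header):
#   python python/scraper.py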
if __name__ == "__main__":
    main()