#!/usr/bin/env python3
"""
Extract External Event Data from URL
=====================================

Fetches an event page, downloads relevant images (posters, banners),
and saves them locally for visual analysis by Claude.

Usage:
    python3 scripts/extract_event_from_url.py <url>

Output:
    - Page text saved to /tmp/event_extract_text.md
    - Images saved to /tmp/event_extract_img_*.jpg/png
    - Summary printed to stdout

Claude can then read these files (including images) to extract location,
times, and other details that may only appear in graphics.
"""

import os
import re
import sys
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse

import requests

# Browser-like User-Agent shared by all requests so sites don't reject the
# default requests UA. Hoisted here so fetch_page/download_image stay in sync.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
}

# File extensions accepted as downloadable images (shared by download_image
# and main so the two checks cannot drift apart).
IMAGE_EXTS = ('.jpg', '.jpeg', '.png', '.webp', '.gif')


class ImageExtractor(HTMLParser):
    """Extract image URLs from HTML, filtering for likely event posters/banners."""

    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.images = []  # dicts with keys: url, alt, class, width, height
        # Substrings in the CSS class that mark non-content images.
        self._skip_classes = {'logo', 'icon', 'avatar', 'favicon', 'sprite', 'social'}
        # Substrings in the src URL that mark non-content images.
        self._skip_srcs = {'logo', 'icon', 'favicon', 'sprite', 'social',
                           'flag', 'emoji', 'pixel', 'tracking'}

    def handle_starttag(self, tag, attrs):
        """Collect <img> tags that look like substantive content images."""
        if tag != 'img':
            return
        attr_dict = dict(attrs)
        src = attr_dict.get('src', '')
        if not src:
            return
        # Skip tiny images (tracking pixels, icons). Only plain numeric
        # width/height attributes are considered; '100%' etc. pass through.
        width = attr_dict.get('width', '')
        height = attr_dict.get('height', '')
        if width and width.isdigit() and int(width) < 100:
            return
        if height and height.isdigit() and int(height) < 100:
            return
        # Skip by class. HTMLParser yields None for a valueless attribute
        # (e.g. '<img class src=...>'), so guard before .lower().
        css_class = (attr_dict.get('class') or '').lower()
        if any(skip in css_class for skip in self._skip_classes):
            return
        # Skip by src pattern.
        src_lower = src.lower()
        if any(skip in src_lower for skip in self._skip_srcs):
            return
        # Resolve relative srcs against the page URL.
        full_url = urljoin(self.base_url, src)
        self.images.append({
            'url': full_url,
            'alt': attr_dict.get('alt', ''),
            'class': css_class,
            'width': width,
            'height': height,
        })


def fetch_page(url):
    """Fetch the page at *url* and return its HTML text.

    Raises requests.RequestException (incl. HTTPError on non-2xx) on failure.
    """
    resp = requests.get(url, headers=_HEADERS, timeout=30)
    resp.raise_for_status()
    return resp.text


def download_image(url, filepath):
    """Download *url* to *filepath*.

    Returns True on success. Returns False (removing any partial file where
    applicable) when the response is not an image, the file is implausibly
    small/large, or the request fails. Failures are reported to stdout and
    never raised — this is a best-effort bulk download.
    """
    try:
        resp = requests.get(url, headers=_HEADERS, timeout=15)
        resp.raise_for_status()
        content_type = resp.headers.get('content-type', '')
        # Accept either an image content-type or an image-looking extension.
        if 'image' not in content_type and not url.lower().endswith(IMAGE_EXTS):
            return False
        with open(filepath, 'wb') as f:
            f.write(resp.content)
        size = os.path.getsize(filepath)
        # Skip very small files (< 5KB likely icons) and very large (> 10MB)
        if size < 5000 or size > 10_000_000:
            os.remove(filepath)
            return False
        return True
    except Exception as e:
        print(f" Skip {url}: {e}")
        return False


def extract_text_simple(html):
    """Very basic HTML to text conversion."""
    # Remove script/style blocks wholesale — their contents are not page text.
    html = re.sub(r'<script[^>]*>.*?</script>', '', html,
                  flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html,
                  flags=re.DOTALL | re.IGNORECASE)
    # Replace common block tags with newlines so the text keeps line structure.
    html = re.sub(r'<(?:p|div|br|h[1-6]|li|tr)[^>]*>', '\n', html,
                  flags=re.IGNORECASE)
    # Remove all remaining tags.
    html = re.sub(r'<[^>]+>', '', html)
    # Decode the common entities. '&amp;' must be decoded LAST, otherwise
    # '&amp;lt;' would be double-decoded into a bare '<'.
    html = (html.replace('&lt;', '<')
                .replace('&gt;', '>')
                .replace('&nbsp;', ' ')
                .replace('&amp;', '&'))
    # Clean up whitespace: strip each line and drop empties.
    lines = [line.strip() for line in html.split('\n')]
    lines = [line for line in lines if line]
    return '\n'.join(lines)


def main():
    if len(sys.argv) < 2:
        # Restored '<url>' placeholder (it had been stripped as an HTML tag).
        print("Usage: python3 scripts/extract_event_from_url.py <url>")
        sys.exit(1)
    url = sys.argv[1]
    print(f"Fetching: {url}")

    # Fetch page
    html = fetch_page(url)

    # Extract text
    text = extract_text_simple(html)
    text_path = '/tmp/event_extract_text.md'
    with open(text_path, 'w', encoding='utf-8') as f:
        f.write(f"# Source: {url}\n\n{text}")
    print(f"Text saved: {text_path} ({len(text)} chars)")

    # Extract and download images
    parser = ImageExtractor(url)
    parser.feed(html)
    print(f"Found {len(parser.images)} candidate images")

    downloaded = []
    for i, img in enumerate(parser.images):
        ext = os.path.splitext(urlparse(img['url']).path)[1] or '.jpg'
        if ext.lower() not in IMAGE_EXTS:
            ext = '.jpg'
        filepath = f'/tmp/event_extract_img_{i}{ext}'
        print(f" Downloading: {img['url'][:80]}...")
        if download_image(img['url'], filepath):
            size_kb = os.path.getsize(filepath) / 1024
            downloaded.append({
                'path': filepath,
                'url': img['url'],
                'alt': img['alt'],
                'size_kb': round(size_kb, 1),
            })
            print(f" Saved: {filepath} ({size_kb:.0f} KB)")

    print("\nResults:")
    print(f" Page text: {text_path}")
    print(f" Images downloaded: {len(downloaded)}")
    for d in downloaded:
        print(f" {d['path']} ({d['size_kb']} KB) — {d['alt'][:60] if d['alt'] else 'no alt'}")
    if downloaded:
        print("\nClaude can now read these images with the Read tool to extract")
        print("location, times, and other visual-only information.")


if __name__ == '__main__':
    main()