feat(scripts): add extract_event_from_url.py for visual data extraction
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Downloads page text and images from external event URLs so Claude can visually analyze posters/banners for location, times, and other details not present in page text (e.g. venue address in graphics). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
32e5c901c4
commit
fe304e895f
181
scripts/extract_event_from_url.py
Normal file
181
scripts/extract_event_from_url.py
Normal file
@ -0,0 +1,181 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Extract External Event Data from URL
|
||||||
|
=====================================
|
||||||
|
|
||||||
|
Fetches an event page, downloads relevant images (posters, banners),
|
||||||
|
and saves them locally for visual analysis by Claude.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/extract_event_from_url.py <URL>
|
||||||
|
|
||||||
|
Output:
|
||||||
|
- Page text saved to /tmp/event_extract_text.md
|
||||||
|
- Images saved to /tmp/event_extract_img_*.jpg/png
|
||||||
|
- Summary printed to stdout
|
||||||
|
|
||||||
|
Claude can then read these files (including images) to extract
|
||||||
|
location, times, and other details that may only appear in graphics.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
|
|
||||||
|
class ImageExtractor(HTMLParser):
    """Extract image URLs from HTML, filtering for likely event posters/banners.

    Collects ``<img>`` tags into ``self.images`` (dicts with ``url``, ``alt``,
    ``class``, ``width``, ``height``), skipping images that look like icons,
    logos, avatars, or tracking pixels based on declared size, CSS class,
    and src substrings.
    """

    def __init__(self, base_url):
        """``base_url``: page URL used to resolve relative image srcs."""
        super().__init__()
        self.base_url = base_url
        self.images = []  # candidate poster/banner images, in document order
        # CSS-class substrings that mark non-content images.
        self._skip_classes = {'logo', 'icon', 'avatar', 'favicon', 'sprite', 'social'}
        # src-URL substrings that mark non-content images.
        self._skip_srcs = {'logo', 'icon', 'favicon', 'sprite', 'social', 'flag', 'emoji', 'pixel', 'tracking'}

    def handle_starttag(self, tag, attrs):
        """Record an <img> tag if it plausibly contains event poster/banner art."""
        if tag != 'img':
            return

        # HTMLParser reports valueless attributes (e.g. <img class src=...>)
        # with a value of None, so normalize every lookup with `or ''` to
        # avoid AttributeError on .lower()/.isdigit().
        attr_dict = dict(attrs)
        src = attr_dict.get('src') or ''
        if not src:
            return

        # Skip tiny images (tracking pixels, icons)
        width = attr_dict.get('width') or ''
        height = attr_dict.get('height') or ''
        if width.isdigit() and int(width) < 100:
            return
        if height.isdigit() and int(height) < 100:
            return

        # Skip by class
        css_class = (attr_dict.get('class') or '').lower()
        if any(skip in css_class for skip in self._skip_classes):
            return

        # Skip by src pattern
        src_lower = src.lower()
        if any(skip in src_lower for skip in self._skip_srcs):
            return

        # Make absolute URL relative to the page
        full_url = urljoin(self.base_url, src)
        self.images.append({
            'url': full_url,
            'alt': attr_dict.get('alt') or '',
            'class': css_class,
            'width': width,
            'height': height,
        })
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_page(url):
    """Fetch *url* and return its HTML body (raises on HTTP error status)."""
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'},
        timeout=30,
    )
    response.raise_for_status()
    return response.text
|
||||||
|
|
||||||
|
|
||||||
|
def download_image(url, filepath):
    """Download *url* to *filepath*; return True on success, False otherwise.

    Rejects responses that do not look like images, and files that are
    implausibly small (< 5 KB, likely icons) or large (> 10 MB). Any
    failure is printed and reported as False rather than raised — this is
    a best-effort batch download.
    """
    ua = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    image_exts = ('.jpg', '.jpeg', '.png', '.webp', '.gif')
    try:
        resp = requests.get(url, headers=ua, timeout=15)
        resp.raise_for_status()
        # Accept when either the Content-Type or the URL extension says "image".
        looks_like_image = (
            'image' in resp.headers.get('content-type', '')
            or url.lower().endswith(image_exts)
        )
        if not looks_like_image:
            return False
        with open(filepath, 'wb') as f:
            f.write(resp.content)
        # Discard icons (< 5 KB) and oversized downloads (> 10 MB).
        size = os.path.getsize(filepath)
        if not 5000 <= size <= 10_000_000:
            os.remove(filepath)
            return False
        return True
    except Exception as e:
        print(f" Skip {url}: {e}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_simple(html):
|
||||||
|
"""Very basic HTML to text conversion."""
|
||||||
|
# Remove script/style
|
||||||
|
html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
|
||||||
|
html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
|
||||||
|
# Replace common block tags with newlines
|
||||||
|
html = re.sub(r'<(?:p|div|br|h[1-6]|li|tr)[^>]*>', '\n', html, flags=re.IGNORECASE)
|
||||||
|
# Remove all remaining tags
|
||||||
|
html = re.sub(r'<[^>]+>', '', html)
|
||||||
|
# Decode entities
|
||||||
|
html = html.replace('&', '&').replace('<', '<').replace('>', '>').replace(' ', ' ')
|
||||||
|
# Clean up whitespace
|
||||||
|
lines = [line.strip() for line in html.split('\n')]
|
||||||
|
lines = [line for line in lines if line]
|
||||||
|
return '\n'.join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: fetch the event URL given in argv, save page text to
    /tmp/event_extract_text.md and candidate images to /tmp/event_extract_img_*,
    then print a summary of what was saved."""
    if len(sys.argv) < 2:
        print("Usage: python3 scripts/extract_event_from_url.py <URL>")
        sys.exit(1)

    url = sys.argv[1]
    print(f"Fetching: {url}")

    # Fetch page
    html = fetch_page(url)

    # Extract text
    text = extract_text_simple(html)
    text_path = '/tmp/event_extract_text.md'
    # Explicit UTF-8 so non-ASCII event pages don't crash on platforms
    # whose default locale encoding is not UTF-8.
    with open(text_path, 'w', encoding='utf-8') as f:
        f.write(f"# Source: {url}\n\n{text}")
    print(f"Text saved: {text_path} ({len(text)} chars)")

    # Extract and download images
    parser = ImageExtractor(url)
    parser.feed(html)

    print(f"Found {len(parser.images)} candidate images")

    downloaded = []
    for i, img in enumerate(parser.images):
        # Keep the source extension when it is a known image type,
        # otherwise default to .jpg.
        ext = os.path.splitext(urlparse(img['url']).path)[1] or '.jpg'
        if ext.lower() not in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
            ext = '.jpg'
        filepath = f'/tmp/event_extract_img_{i}{ext}'
        print(f" Downloading: {img['url'][:80]}...")
        if download_image(img['url'], filepath):
            size_kb = os.path.getsize(filepath) / 1024
            downloaded.append({
                'path': filepath,
                'url': img['url'],
                'alt': img['alt'],
                'size_kb': round(size_kb, 1),
            })
            print(f" Saved: {filepath} ({size_kb:.0f} KB)")

    print("\nResults:")
    print(f" Page text: {text_path}")
    print(f" Images downloaded: {len(downloaded)}")
    for d in downloaded:
        print(f" {d['path']} ({d['size_kb']} KB) — {d['alt'][:60] if d['alt'] else 'no alt'}")

    if downloaded:
        print("\nClaude can now read these images with the Read tool to extract")
        print("location, times, and other visual-only information.")
||||||
|
|
||||||
|
# Script entry point: run the extraction for the URL given on the command line.
if __name__ == '__main__':
    main()
|
||||||
Loading…
Reference in New Issue
Block a user