feat(scripts): add extract_event_from_url.py for visual data extraction
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Downloads page text and images from external event URLs so Claude can visually analyze posters/banners for location, times, and other details not present in page text (e.g. venue address in graphics). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
32e5c901c4
commit
fe304e895f
181
scripts/extract_event_from_url.py
Normal file
181
scripts/extract_event_from_url.py
Normal file
@ -0,0 +1,181 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Extract External Event Data from URL
|
||||||
|
=====================================
|
||||||
|
|
||||||
|
Fetches an event page, downloads relevant images (posters, banners),
|
||||||
|
and saves them locally for visual analysis by Claude.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/extract_event_from_url.py <URL>
|
||||||
|
|
||||||
|
Output:
|
||||||
|
- Page text saved to /tmp/event_extract_text.md
|
||||||
|
- Images saved to /tmp/event_extract_img_*.jpg/png
|
||||||
|
- Summary printed to stdout
|
||||||
|
|
||||||
|
Claude can then read these files (including images) to extract
|
||||||
|
location, times, and other details that may only appear in graphics.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
|
|
||||||
|
class ImageExtractor(HTMLParser):
    """Extract image URLs from HTML, filtering for likely event posters/banners.

    Collects ``<img>`` tags into ``self.images`` (dicts with ``url``, ``alt``,
    ``class``, ``width``, ``height``), skipping images that look like icons,
    logos, avatars, or tracking pixels based on declared size, CSS class,
    and src substrings.
    """

    def __init__(self, base_url):
        """``base_url``: page URL used to resolve relative image srcs."""
        super().__init__()
        self.base_url = base_url
        self.images = []  # candidate poster/banner images, in document order
        # CSS-class substrings that mark non-content images.
        self._skip_classes = {'logo', 'icon', 'avatar', 'favicon', 'sprite', 'social'}
        # src-URL substrings that mark non-content images.
        self._skip_srcs = {'logo', 'icon', 'favicon', 'sprite', 'social', 'flag', 'emoji', 'pixel', 'tracking'}

    def handle_starttag(self, tag, attrs):
        """Record an <img> tag if it plausibly contains event poster/banner art."""
        if tag != 'img':
            return

        # HTMLParser reports valueless attributes (e.g. <img class src=...>)
        # with a value of None, so normalize every lookup with `or ''` to
        # avoid AttributeError on .lower()/.isdigit().
        attr_dict = dict(attrs)
        src = attr_dict.get('src') or ''
        if not src:
            return

        # Skip tiny images (tracking pixels, icons)
        width = attr_dict.get('width') or ''
        height = attr_dict.get('height') or ''
        if width.isdigit() and int(width) < 100:
            return
        if height.isdigit() and int(height) < 100:
            return

        # Skip by class
        css_class = (attr_dict.get('class') or '').lower()
        if any(skip in css_class for skip in self._skip_classes):
            return

        # Skip by src pattern
        src_lower = src.lower()
        if any(skip in src_lower for skip in self._skip_srcs):
            return

        # Make absolute URL relative to the page
        full_url = urljoin(self.base_url, src)
        self.images.append({
            'url': full_url,
            'alt': attr_dict.get('alt') or '',
            'class': css_class,
            'width': width,
            'height': height,
        })
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_page(url):
    """Fetch *url* and return its HTML body (raises on HTTP error status)."""
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'},
        timeout=30,
    )
    response.raise_for_status()
    return response.text
|
||||||
|
|
||||||
|
|
||||||
|
def download_image(url, filepath):
    """Download *url* to *filepath*; return True on success, False otherwise.

    Rejects responses that do not look like images, and files that are
    implausibly small (< 5 KB, likely icons) or large (> 10 MB). Any
    failure is printed and reported as False rather than raised — this is
    a best-effort batch download.
    """
    ua = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    image_exts = ('.jpg', '.jpeg', '.png', '.webp', '.gif')
    try:
        resp = requests.get(url, headers=ua, timeout=15)
        resp.raise_for_status()
        # Accept when either the Content-Type or the URL extension says "image".
        looks_like_image = (
            'image' in resp.headers.get('content-type', '')
            or url.lower().endswith(image_exts)
        )
        if not looks_like_image:
            return False
        with open(filepath, 'wb') as f:
            f.write(resp.content)
        # Discard icons (< 5 KB) and oversized downloads (> 10 MB).
        size = os.path.getsize(filepath)
        if not 5000 <= size <= 10_000_000:
            os.remove(filepath)
            return False
        return True
    except Exception as e:
        print(f" Skip {url}: {e}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_simple(html):
|
||||||
|
"""Very basic HTML to text conversion."""
|
||||||
|
# Remove script/style
|
||||||
|
html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
|
||||||
|
html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
|
||||||
|
# Replace common block tags with newlines
|
||||||
|
html = re.sub(r'<(?:p|div|br|h[1-6]|li|tr)[^>]*>', '\n', html, flags=re.IGNORECASE)
|
||||||
|
# Remove all remaining tags
|
||||||
|
html = re.sub(r'<[^>]+>', '', html)
|
||||||
|
# Decode entities
|
||||||
|
html = html.replace('&', '&').replace('<', '<').replace('>', '>').replace(' ', ' ')
|
||||||
|
# Clean up whitespace
|
||||||
|
lines = [line.strip() for line in html.split('\n')]
|
||||||
|
lines = [line for line in lines if line]
|
||||||
|
return '\n'.join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: fetch the event URL given in argv, save page text to
    /tmp/event_extract_text.md and candidate images to /tmp/event_extract_img_*,
    then print a summary of what was saved."""
    if len(sys.argv) < 2:
        print("Usage: python3 scripts/extract_event_from_url.py <URL>")
        sys.exit(1)

    url = sys.argv[1]
    print(f"Fetching: {url}")

    # Fetch page
    html = fetch_page(url)

    # Extract text
    text = extract_text_simple(html)
    text_path = '/tmp/event_extract_text.md'
    # Explicit UTF-8 so non-ASCII event pages don't crash on platforms
    # whose default locale encoding is not UTF-8.
    with open(text_path, 'w', encoding='utf-8') as f:
        f.write(f"# Source: {url}\n\n{text}")
    print(f"Text saved: {text_path} ({len(text)} chars)")

    # Extract and download images
    parser = ImageExtractor(url)
    parser.feed(html)

    print(f"Found {len(parser.images)} candidate images")

    downloaded = []
    for i, img in enumerate(parser.images):
        # Keep the source extension when it is a known image type,
        # otherwise default to .jpg.
        ext = os.path.splitext(urlparse(img['url']).path)[1] or '.jpg'
        if ext.lower() not in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
            ext = '.jpg'
        filepath = f'/tmp/event_extract_img_{i}{ext}'
        print(f" Downloading: {img['url'][:80]}...")
        if download_image(img['url'], filepath):
            size_kb = os.path.getsize(filepath) / 1024
            downloaded.append({
                'path': filepath,
                'url': img['url'],
                'alt': img['alt'],
                'size_kb': round(size_kb, 1),
            })
            print(f" Saved: {filepath} ({size_kb:.0f} KB)")

    print("\nResults:")
    print(f" Page text: {text_path}")
    print(f" Images downloaded: {len(downloaded)}")
    for d in downloaded:
        print(f" {d['path']} ({d['size_kb']} KB) — {d['alt'][:60] if d['alt'] else 'no alt'}")

    if downloaded:
        print("\nClaude can now read these images with the Read tool to extract")
        print("location, times, and other visual-only information.")
||||||
|
|
||||||
|
# Script entry point: run the extraction for the URL given on the command line.
if __name__ == '__main__':
    main()
|
||||||
Loading…
Reference in New Issue
Block a user