# nordabiz/blueprints/messages/link_preview.py

"""
Link Preview — fetch Open Graph metadata for URLs in messages.
"""

import ipaddress
import logging
import re
from html.parser import HTMLParser
from urllib.parse import urlparse

import requests

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Matches an http:// or https:// URL: the scheme followed by a run of
# characters that are not whitespace, angle brackets, or quotes.
URL_REGEX = re.compile(r'https?://[^\s<>"\']+')
# Domains whose links are treated as internal; no preview is fetched for them.
INTERNAL_DOMAINS = ['nordabiznes.pl', 'staging.nordabiznes.pl', 'localhost']


class OGParser(HTMLParser):
    """Collect Open Graph metadata and the document title from HTML.

    After ``feed()``:

    * ``og`` maps ``'title'``, ``'description'`` and ``'image'`` to the
      non-empty ``content`` of the matching ``og:*`` meta tag. A plain
      ``<meta name="description">`` fills ``'description'`` only when no
      value has been recorded for it yet.
    * ``title`` is the stripped text of the first ``<title>`` element, or
      ``None`` if no complete ``<title>`` element was seen.
    """

    # Open Graph properties that are copied into ``og`` (minus the prefix).
    _OG_PROPERTIES = ('og:title', 'og:description', 'og:image')

    def __init__(self):
        super().__init__()
        self.og = {}
        self.title = None
        self._in_title = False
        self._title_data = []

    def handle_starttag(self, tag, attrs):
        """Record relevant ``<meta>`` tags and detect the first ``<title>``."""
        if tag == 'meta':
            d = dict(attrs)
            # Valueless attributes (e.g. ``<meta content>``) are reported as
            # None by HTMLParser; normalise them to an empty string.
            prop = d.get('property') or d.get('name') or ''
            content = d.get('content') or ''
            if not content:
                # An empty tag carries no information and must not block a
                # later, populated tag (or the plain description fallback).
                return
            if prop in self._OG_PROPERTIES:
                self.og[prop.replace('og:', '')] = content
            elif prop == 'description' and 'description' not in self.og:
                self.og['description'] = content
        elif tag == 'title' and self.title is None:
            # Only the first <title> is the document title; later ones
            # (typically inside inline <svg>) must not be appended to it.
            self._in_title = True
            self._title_data = []

    def handle_data(self, data):
        """Accumulate text while inside the captured ``<title>`` element."""
        if self._in_title:
            self._title_data.append(data)

    def handle_endtag(self, tag):
        """Finalize ``title`` when the captured ``<title>`` element closes."""
        if tag == 'title' and self._in_title:
            self._in_title = False
            self.title = ''.join(self._title_data).strip()


# Upper bound on how much of a response body is downloaded and parsed.
_MAX_HTML_BYTES = 100_000
# Sentence punctuation that URL_REGEX captures when a URL ends a sentence.
_TRAILING_PUNCTUATION = '.,;:!?'


def _is_internal_host(hostname):
    """Return True if *hostname* is an internal domain or a subdomain of one."""
    host = hostname.rstrip('.')
    # Require an exact match or a dot boundary so that look-alike hosts such
    # as "evilnordabiznes.pl" are not mistaken for internal ones.
    return any(host == d or host.endswith('.' + d) for d in INTERNAL_DOMAINS)


def _is_non_public_ip(hostname):
    """Return True if *hostname* is an IP literal outside the public range."""
    try:
        return not ipaddress.ip_address(hostname).is_global
    except ValueError:
        # Not an IP literal (i.e. a regular DNS name).
        return False


def _read_capped(resp, limit):
    """Return at most *limit* bytes of the streamed response body."""
    chunks = []
    remaining = limit
    for chunk in resp.iter_content(chunk_size=8192):
        chunks.append(chunk[:remaining])
        remaining -= len(chunk)
        if remaining <= 0:
            break
    return b''.join(chunks)


def fetch_link_preview(text):
    """Extract the first URL from *text* and fetch its preview metadata.

    HTML tags are stripped from *text* before URL detection, and trailing
    sentence punctuation is removed from the detected URL.

    Args:
        text: Message body (may contain HTML markup); ``None`` or an empty
            string is accepted.

    Returns:
        A dict with keys ``'url'``, ``'title'`` (at most 200 characters),
        ``'description'`` (at most 300 characters, possibly empty) and
        ``'image'`` (possibly empty), or ``None`` when:

        * no URL is found or the URL cannot be parsed,
        * the host is an internal domain or a non-public IP literal,
        * the request fails, returns an error status, or is not ``text/html``,
        * the page provides neither an ``og:title`` nor a ``<title>``.

    This function is best-effort: fetch and parse failures are logged at
    debug level and reported as ``None`` rather than raised.
    """
    # Strip HTML tags for URL detection
    clean = re.sub(r'<[^>]+>', '', text or '')
    match = URL_REGEX.search(clean)
    if match is None:
        return None

    url = match.group(0).rstrip(_TRAILING_PUNCTUATION)

    # urlparse raises ValueError for malformed hosts such as "https://[foo".
    try:
        hostname = urlparse(url).hostname
    except ValueError:
        return None

    # Skip unparseable hosts, internal links, and private/loopback IP
    # literals (the latter to avoid server-side requests to internal
    # services).
    # NOTE(review): DNS names that resolve to private addresses and
    # redirects to such hosts are not covered by this check.
    if not hostname or _is_internal_host(hostname) or _is_non_public_ip(hostname):
        return None

    try:
        # stream=True defers the body download so the status and content
        # type can be checked first and the amount read can be capped; the
        # context manager releases the connection in every case.
        with requests.get(url, timeout=3, headers={
            'User-Agent': 'NordaBiznes/1.0 (Link Preview)'
        }, allow_redirects=True, stream=True) as resp:
            resp.raise_for_status()

            content_type = resp.headers.get('content-type', '')
            if 'text/html' not in content_type.lower():
                return None

            body = _read_capped(resp, _MAX_HTML_BYTES)
            encoding = resp.encoding or 'utf-8'

        html = body.decode(encoding, errors='replace')
        parser = OGParser()
        parser.feed(html)

        title = parser.og.get('title') or parser.title
        if not title:
            return None

        return {
            'url': url,
            'title': title[:200],
            'description': (parser.og.get('description') or '')[:300],
            'image': parser.og.get('image') or '',
        }
    except Exception as e:
        # Deliberate best-effort boundary: a failed preview must never break
        # message handling.
        logger.debug("Link preview failed for %s: %s", url, e)
        return None