feat(messages): add link preview for URLs in messages
This commit is contained in:
parent
362ff74b91
commit
5182be0748
89
blueprints/messages/link_preview.py
Normal file
89
blueprints/messages/link_preview.py
Normal file
@ -0,0 +1,89 @@
|
||||
"""
|
||||
Link Preview — fetch Open Graph metadata for URLs in messages.
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
import requests
|
||||
from html.parser import HTMLParser
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Matches an http(s) URL up to the next whitespace, angle bracket or quote.
URL_REGEX = re.compile(r'https?://[^\s<>"\']+')

# Hosts whose links are treated as internal.
INTERNAL_DOMAINS = [
    'nordabiznes.pl',
    'staging.nordabiznes.pl',
    'localhost',
]
|
||||
|
||||
|
||||
class OGParser(HTMLParser):
    """Collect Open Graph metadata and the page title from an HTML document.

    After ``feed()``:

    * ``og`` maps ``'title'``, ``'description'`` and ``'image'`` to the
      ``content`` of the matching ``og:*`` meta tag (only keys that were
      seen are present). A plain ``description`` meta tag fills
      ``'description'`` only when no description has been recorded yet; a
      later ``og:description`` still overrides it.
    * ``title`` is the stripped text of the first ``<title>`` element, or
      ``None`` if no ``<title>`` element was seen and closed.
    """

    def __init__(self):
        super().__init__()
        self.og = {}
        self.title = None
        self._in_title = False
        self._title_data = []

    def handle_starttag(self, tag, attrs):
        """Record og:/description meta tags and note when the first <title> opens."""
        if tag == 'meta':
            attr_map = dict(attrs)
            # Valueless attributes (e.g. <meta content>) are reported as None,
            # so normalise both lookups to strings.
            prop = attr_map.get('property') or attr_map.get('name') or ''
            content = attr_map.get('content') or ''
            if prop in ('og:title', 'og:description', 'og:image'):
                self.og[prop.replace('og:', '')] = content
            elif prop == 'description' and 'description' not in self.og:
                self.og['description'] = content
        elif tag == 'title' and self.title is None:
            # Only the first <title> is the document title. Later ones
            # (for example inside an inline <svg>) must not be appended to it.
            self._in_title = True

    def handle_data(self, data):
        """Accumulate text while inside the captured <title> element."""
        if self._in_title:
            self._title_data.append(data)

    def handle_endtag(self, tag):
        """Finalise ``title`` when the captured <title> element closes."""
        # A stray </title> with no matching captured start tag is ignored.
        if tag == 'title' and self._in_title:
            self._in_title = False
            self.title = ''.join(self._title_data).strip()
|
||||
|
||||
|
||||
# Upper bound on redirect hops; each hop is validated before it is fetched.
_MAX_REDIRECTS = 5
# Only this many bytes of a response body are downloaded and parsed.
_MAX_PREVIEW_BYTES = 100_000


def _is_internal_host(hostname):
    """Return True if *hostname* equals an INTERNAL_DOMAINS entry or is a subdomain of one."""
    host = hostname.lower().rstrip('.')
    # Match on a label boundary so "evilnordabiznes.pl" is not treated as internal.
    return any(host == domain or host.endswith('.' + domain) for domain in INTERNAL_DOMAINS)


def _is_public_host(hostname):
    """Return True only if every address *hostname* resolves to is publicly routable."""
    import ipaddress
    import socket

    try:
        infos = socket.getaddrinfo(hostname, None)
    except (socket.gaierror, UnicodeError):
        return False

    addresses = {info[4][0] for info in infos}
    if not addresses:
        return False
    for address in addresses:
        # getaddrinfo may append a "%scope" suffix to IPv6 addresses.
        ip = ipaddress.ip_address(address.split('%', 1)[0])
        if not ip.is_global or ip.is_multicast:
            return False
    return True


def _read_limited(resp, limit):
    """Read at most *limit* bytes from a streamed ``requests`` response."""
    chunks = []
    received = 0
    for chunk in resp.iter_content(chunk_size=8192):
        chunks.append(chunk)
        received += len(chunk)
        if received >= limit:
            break
    return b''.join(chunks)[:limit]


def fetch_link_preview(text):
    """Extract the first URL from *text* and fetch its Open Graph metadata.

    Args:
        text: Message body, possibly containing HTML markup; may be None.

    Returns:
        A dict with keys ``'url'``, ``'title'`` (max 200 chars),
        ``'description'`` (max 300 chars) and ``'image'`` (absolute http(s)
        URL or ``''``), or ``None`` when there is no URL, the URL points to
        an internal or non-public host, the response is not HTML, no title
        can be found, or the fetch fails for any reason.
    """
    from html import unescape
    from urllib.parse import urljoin

    # Strip HTML tags, then decode entities so "&amp;" in a link becomes "&".
    clean = unescape(re.sub(r'<[^>]+>', '', text or ''))
    match = URL_REGEX.search(clean)
    if match is None:
        return None
    # Sentence punctuation directly after a link is not part of the URL.
    url = match.group(0).rstrip('.,;:!?')

    try:
        target = url
        # Follow redirects manually so every hop passes the host checks.
        for _ in range(_MAX_REDIRECTS + 1):
            host = urlparse(target).hostname
            # NOTE(review): the address is checked before the request resolves
            # it again, so DNS rebinding is not fully covered here.
            if not host or _is_internal_host(host) or not _is_public_host(host):
                return None

            with requests.get(
                target,
                timeout=3,
                headers={'User-Agent': 'NordaBiznes/1.0 (Link Preview)'},
                allow_redirects=False,
                stream=True,  # do not download the body until we ask for it
            ) as resp:
                if resp.is_redirect:
                    target = urljoin(target, resp.headers['location'])
                    continue
                resp.raise_for_status()

                content_type = resp.headers.get('content-type', '')
                if 'text/html' not in content_type.lower():
                    return None

                raw = _read_limited(resp, _MAX_PREVIEW_BYTES)
                try:
                    page = raw.decode(resp.encoding or 'utf-8', errors='replace')
                except LookupError:
                    # Server announced a charset Python does not know.
                    page = raw.decode('utf-8', errors='replace')

            parser = OGParser()
            parser.feed(page)

            title = parser.og.get('title') or parser.title
            if not title:
                return None

            image = parser.og.get('image') or ''
            if image:
                # og:image is often relative; resolve it and keep only web URLs.
                image = urljoin(target, image)
                if urlparse(image).scheme not in ('http', 'https'):
                    image = ''

            return {
                'url': url,
                'title': title[:200],
                'description': (parser.og.get('description') or '')[:300],
                'image': image,
            }

        # Redirect limit exceeded.
        return None
    except Exception as e:
        # Best-effort: any failure (network, malformed URL, parsing) yields
        # None instead of propagating to the caller.
        logger.debug("Link preview failed for %s: %s", url, e)
        return None
|
||||
Loading…
Reference in New Issue
Block a user