nordabiz/update_social_media.py
Maciej Pienczyn 986360f7d5 feat: Add URL normalization and inline audit sections
- Add normalize_social_url() function to database.py to prevent
  www vs non-www duplicates in social media records
- Update update_social_media.py to normalize URLs before insert
- Update social_media_audit.py to normalize URLs before insert
- Add inline GBP Audit section to company profile
- Add inline Social Media Audit section to company profile
- Add inline IT Audit section to company profile

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-11 23:07:03 +01:00

194 lines
6.7 KiB
Python

#!/usr/bin/env python3
"""
Update Social Media profiles in NordaBiz database.
Reads from social_media_found.json and updates company_social_media table.
Usage:
python update_social_media.py [--dry-run]
"""
import os
import json
import sys
from datetime import datetime
# Database connection: Use environment variable DATABASE_URL
# WARNING: DATABASE_URL must be set before running this script.
# Example: export DATABASE_URL='postgresql://nordabiz_app:PASSWORD@localhost:5432/nordabiz'
# The database module will fall back to a safe placeholder if not set.
# NEVER commit real credentials to version control (CWE-798).
from database import SessionLocal, Company, CompanySocialMedia, normalize_social_url
from sqlalchemy import func
PLATFORMS = ['facebook', 'instagram', 'youtube', 'linkedin', 'tiktok', 'twitter']
def load_social_media_data():
"""Load social media data from JSON file."""
with open('social_media_found.json', 'r') as f:
data = json.load(f)
return data['results']
def update_social_media(dry_run=False):
"""Update social media profiles in database."""
db = SessionLocal()
try:
results = load_social_media_data()
stats = {
'companies_processed': 0,
'profiles_added': 0,
'profiles_updated': 0,
'profiles_unchanged': 0,
'errors': 0
}
changes = []
for item in results:
company_id = item['company_id']
company_name = item['name']
stats['companies_processed'] += 1
# Check if company exists
company = db.query(Company).filter(Company.id == company_id).first()
if not company:
print(f" WARNING: Company ID {company_id} ({company_name}) not found in database")
stats['errors'] += 1
continue
for platform in PLATFORMS:
url = item.get(platform)
if not url:
continue
# Normalize URL to prevent www vs non-www duplicates
url = normalize_social_url(url, platform)
# Check if profile already exists for this platform
existing = db.query(CompanySocialMedia).filter(
CompanySocialMedia.company_id == company_id,
CompanySocialMedia.platform == platform
).first()
if existing:
# Normalize existing URL for comparison
existing_normalized = normalize_social_url(existing.url, platform)
if existing_normalized != url:
old_url = existing.url
if not dry_run:
existing.url = url
existing.verified_at = datetime.now()
existing.source = 'web_search_2025-12-29'
changes.append({
'company_id': company_id,
'company_name': company_name,
'platform': platform,
'action': 'updated',
'old_url': old_url,
'new_url': url
})
stats['profiles_updated'] += 1
else:
stats['profiles_unchanged'] += 1
else:
if not dry_run:
new_profile = CompanySocialMedia(
company_id=company_id,
platform=platform,
url=url,
verified_at=datetime.now(),
source='web_search_2025-12-29',
is_valid=True
)
db.add(new_profile)
changes.append({
'company_id': company_id,
'company_name': company_name,
'platform': platform,
'action': 'added',
'url': url
})
stats['profiles_added'] += 1
if not dry_run:
db.commit()
print("\n=== Changes committed to database ===")
else:
print("\n=== DRY RUN - No changes made ===")
# Print statistics
print(f"\n=== Statistics ===")
print(f"Companies processed: {stats['companies_processed']}")
print(f"Profiles added: {stats['profiles_added']}")
print(f"Profiles updated: {stats['profiles_updated']}")
print(f"Profiles unchanged: {stats['profiles_unchanged']}")
print(f"Errors: {stats['errors']}")
# Print changes summary
if changes:
print(f"\n=== Changes Summary ===")
for change in changes:
if change['action'] == 'added':
print(f" + {change['company_name']} | {change['platform']}: {change['url']}")
else:
print(f" ~ {change['company_name']} | {change['platform']}: {change['old_url']} -> {change['new_url']}")
return stats, changes
except Exception as e:
db.rollback()
print(f"ERROR: {e}")
raise
finally:
db.close()
def print_summary_table(changes):
"""Print a summary table of all social media by platform."""
print("\n" + "="*80)
print("SOCIAL MEDIA SUMMARY - NordaBiz 2025-12-29")
print("="*80)
with open('social_media_found.json', 'r') as f:
data = json.load(f)
results = data['results']
# Count by platform
platform_counts = {p: 0 for p in PLATFORMS}
for item in results:
for platform in PLATFORMS:
if item.get(platform):
platform_counts[platform] += 1
print("\nPlatform Statistics:")
print("-" * 40)
for platform, count in sorted(platform_counts.items(), key=lambda x: -x[1]):
bar = "#" * (count // 2)
print(f" {platform:12} | {count:2} | {bar}")
total_profiles = sum(platform_counts.values())
companies_with_any = sum(1 for r in results if any(r.get(p) for p in PLATFORMS))
companies_without = len(results) - companies_with_any
print(f"\nTotal profiles found: {total_profiles}")
print(f"Companies with social media: {companies_with_any}")
print(f"Companies without social media: {companies_without}")
if __name__ == '__main__':
dry_run = '--dry-run' in sys.argv
print("="*60)
print("NordaBiz Social Media Update")
print("="*60)
if dry_run:
print("MODE: Dry run (no changes will be made)")
else:
print("MODE: Live update")
stats, changes = update_social_media(dry_run=dry_run)
print_summary_table(changes)