nordabiz/database/schema.sql
2026-01-01 14:01:49 +01:00

425 lines
15 KiB
PL/PgSQL

-- Norda Biznes - PostgreSQL Database Schema
-- Version: 1.0
-- Created: 2025-11-23
-- Description: Complete database structure for company directory
-- Enable UUID extension
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
-- ============================================================
-- LOOKUP TABLES (Reference Data)
-- ============================================================
-- Categories for companies
CREATE TABLE categories (
id SERIAL PRIMARY KEY,
name VARCHAR(100) NOT NULL UNIQUE,
slug VARCHAR(100) NOT NULL UNIQUE,
description TEXT,
icon VARCHAR(50),
sort_order INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Data quality levels
CREATE TYPE data_quality_level AS ENUM ('basic', 'partial', 'complete', 'verified');
-- Company status
CREATE TYPE company_status AS ENUM ('active', 'inactive', 'pending', 'archived');
-- ============================================================
-- MAIN COMPANIES TABLE
-- ============================================================
CREATE TABLE companies (
id SERIAL PRIMARY KEY,
-- Basic Info
name VARCHAR(255) NOT NULL,
legal_name VARCHAR(255),
slug VARCHAR(255) NOT NULL UNIQUE,
-- Category
category_id INTEGER REFERENCES categories(id),
-- Descriptions
description_short TEXT,
description_full TEXT,
-- Legal Identifiers
nip VARCHAR(10) UNIQUE,
regon VARCHAR(14),
krs VARCHAR(10),
-- Contact Info
website VARCHAR(500),
email VARCHAR(255),
phone VARCHAR(50),
-- Address (denormalized for simplicity)
address_street VARCHAR(255),
address_city VARCHAR(100),
address_postal VARCHAR(10),
address_full TEXT,
-- Business Data
year_established INTEGER,
employees_count INTEGER,
capital_amount DECIMAL(15, 2),
-- Status & Quality
status company_status DEFAULT 'active',
data_quality data_quality_level DEFAULT 'basic',
-- Metadata
last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
researched_at TIMESTAMP,
verified_at TIMESTAMP,
-- Search optimization
search_vector tsvector,
-- Constraints
CONSTRAINT valid_nip CHECK (nip ~ '^\d{10}$' OR nip IS NULL),
CONSTRAINT valid_email CHECK (email ~* '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$' OR email IS NULL)
);
-- Indexes for performance
CREATE INDEX idx_companies_category ON companies(category_id);
CREATE INDEX idx_companies_nip ON companies(nip);
CREATE INDEX idx_companies_status ON companies(status);
CREATE INDEX idx_companies_quality ON companies(data_quality);
CREATE INDEX idx_companies_search ON companies USING gin(search_vector);
CREATE INDEX idx_companies_slug ON companies(slug);
-- Full-text search trigger
CREATE OR REPLACE FUNCTION companies_search_trigger() RETURNS trigger AS $$
BEGIN
NEW.search_vector :=
setweight(to_tsvector('polish', COALESCE(NEW.name, '')), 'A') ||
setweight(to_tsvector('polish', COALESCE(NEW.description_short, '')), 'B') ||
setweight(to_tsvector('polish', COALESCE(NEW.description_full, '')), 'C');
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE TRIGGER tsvector_update BEFORE INSERT OR UPDATE
ON companies FOR EACH ROW EXECUTE FUNCTION companies_search_trigger();
-- ============================================================
-- SERVICES (Many-to-Many)
-- ============================================================
CREATE TABLE services (
id SERIAL PRIMARY KEY,
name VARCHAR(255) NOT NULL UNIQUE,
slug VARCHAR(255) NOT NULL UNIQUE,
description TEXT,
icon VARCHAR(50),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE company_services (
company_id INTEGER REFERENCES companies(id) ON DELETE CASCADE,
service_id INTEGER REFERENCES services(id) ON DELETE CASCADE,
is_primary BOOLEAN DEFAULT FALSE,
added_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (company_id, service_id)
);
CREATE INDEX idx_company_services_company ON company_services(company_id);
CREATE INDEX idx_company_services_service ON company_services(service_id);
-- ============================================================
-- COMPETENCIES (Many-to-Many)
-- ============================================================
CREATE TABLE competencies (
id SERIAL PRIMARY KEY,
name VARCHAR(255) NOT NULL UNIQUE,
slug VARCHAR(255) NOT NULL UNIQUE,
category VARCHAR(100), -- e.g., "Technology", "Certification", "Skill"
description TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE company_competencies (
company_id INTEGER REFERENCES companies(id) ON DELETE CASCADE,
competency_id INTEGER REFERENCES competencies(id) ON DELETE CASCADE,
level VARCHAR(50), -- e.g., "Expert", "Advanced", "Certified"
added_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (company_id, competency_id)
);
CREATE INDEX idx_company_competencies_company ON company_competencies(company_id);
CREATE INDEX idx_company_competencies_competency ON company_competencies(competency_id);
-- ============================================================
-- CERTIFICATIONS
-- ============================================================
CREATE TABLE certifications (
id SERIAL PRIMARY KEY,
company_id INTEGER REFERENCES companies(id) ON DELETE CASCADE,
name VARCHAR(255) NOT NULL,
issuer VARCHAR(255),
certificate_number VARCHAR(100),
issue_date DATE,
expiry_date DATE,
is_active BOOLEAN DEFAULT TRUE,
document_url VARCHAR(500),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX idx_certifications_company ON certifications(company_id);
CREATE INDEX idx_certifications_active ON certifications(is_active);
-- ============================================================
-- AWARDS
-- ============================================================
CREATE TABLE awards (
id SERIAL PRIMARY KEY,
company_id INTEGER REFERENCES companies(id) ON DELETE CASCADE,
name VARCHAR(255) NOT NULL,
issuer VARCHAR(255),
year INTEGER,
description TEXT,
rank VARCHAR(50), -- e.g., "Gold", "Diamond", "1st Place"
url VARCHAR(500),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX idx_awards_company ON awards(company_id);
CREATE INDEX idx_awards_year ON awards(year);
-- ============================================================
-- FINANCIAL DATA (Historical)
-- ============================================================
CREATE TABLE financial_data (
id SERIAL PRIMARY KEY,
company_id INTEGER REFERENCES companies(id) ON DELETE CASCADE,
year INTEGER NOT NULL,
revenue DECIMAL(15, 2),
profit DECIMAL(15, 2),
assets DECIMAL(15, 2),
liabilities DECIMAL(15, 2),
equity DECIMAL(15, 2),
employees INTEGER,
currency VARCHAR(3) DEFAULT 'PLN',
source VARCHAR(255),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(company_id, year)
);
CREATE INDEX idx_financial_company ON financial_data(company_id);
CREATE INDEX idx_financial_year ON financial_data(year);
-- ============================================================
-- SOCIAL MEDIA
-- ============================================================
CREATE TABLE social_media (
id SERIAL PRIMARY KEY,
company_id INTEGER REFERENCES companies(id) ON DELETE CASCADE,
platform VARCHAR(50) NOT NULL, -- 'facebook', 'linkedin', 'instagram', 'twitter', 'youtube'
url VARCHAR(500) NOT NULL,
follower_count INTEGER,
is_verified BOOLEAN DEFAULT FALSE,
last_checked TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
UNIQUE(company_id, platform)
);
CREATE INDEX idx_social_company ON social_media(company_id);
CREATE INDEX idx_social_platform ON social_media(platform);
-- ============================================================
-- DATA SOURCES (Tracking)
-- ============================================================
CREATE TABLE data_sources (
id SERIAL PRIMARY KEY,
name VARCHAR(100) NOT NULL UNIQUE,
url VARCHAR(500),
type VARCHAR(50), -- 'website', 'registry', 'api', 'manual'
reliability_score INTEGER DEFAULT 5, -- 1-5 stars
is_active BOOLEAN DEFAULT TRUE,
description TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE company_data_sources (
company_id INTEGER REFERENCES companies(id) ON DELETE CASCADE,
source_id INTEGER REFERENCES data_sources(id) ON DELETE CASCADE,
collected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
fields_collected TEXT[], -- Array of field names collected from this source
PRIMARY KEY (company_id, source_id)
);
CREATE INDEX idx_company_sources_company ON company_data_sources(company_id);
CREATE INDEX idx_company_sources_source ON company_data_sources(source_id);
-- ============================================================
-- COLLECTION LOG (Agent Activity)
-- ============================================================
CREATE TABLE collection_log (
id SERIAL PRIMARY KEY,
company_id INTEGER REFERENCES companies(id) ON DELETE CASCADE,
action VARCHAR(50) NOT NULL, -- 'created', 'updated', 'verified', 'enriched'
status VARCHAR(50) NOT NULL, -- 'success', 'partial', 'failed'
source VARCHAR(100),
fields_updated TEXT[],
error_message TEXT,
duration_ms INTEGER,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX idx_collection_log_company ON collection_log(company_id);
CREATE INDEX idx_collection_log_created ON collection_log(created_at);
-- ============================================================
-- INITIAL REFERENCE DATA
-- ============================================================
-- Insert categories
INSERT INTO categories (name, slug, description, sort_order) VALUES
('IT', 'it', 'Technologie informacyjne, oprogramowanie, usługi IT', 1),
('Produkcja', 'produkcja', 'Firmy produkcyjne, wytwórcze', 2),
('Handel', 'handel', 'Firmy handlowe, dystrybucja', 3),
('Usługi', 'uslugi', 'Usługi dla biznesu i konsumentów', 4),
('Budownictwo', 'budownictwo', 'Firmy budowlane, deweloperzy', 5),
('Doradztwo', 'doradztwo', 'Doradztwo biznesowe, prawne, finansowe', 6),
('Media', 'media', 'Media, marketing, reklama', 7),
('Finanse', 'finanse', 'Usługi finansowe, bankowość', 8),
('Inne', 'inne', 'Pozostałe branże', 99);
-- Insert common data sources
INSERT INTO data_sources (name, url, type, reliability_score, description) VALUES
('ALEO.com', 'https://aleo.com', 'registry', 5, 'Oficjalny rejestr firm Polski - dane urzędowe'),
('Google Search', 'https://google.com', 'search', 4, 'Wyszukiwanie stron WWW firm'),
('Company Website', NULL, 'website', 5, 'Oficjalna strona internetowa firmy'),
('LinkedIn', 'https://linkedin.com', 'social', 3, 'Profile firmowe LinkedIn'),
('Facebook', 'https://facebook.com', 'social', 3, 'Strony firmowe Facebook'),
('norda-biznes.info', 'https://norda-biznes.info', 'website', 2, 'Portal Norda Biznes (tymczasowo nieaktywny)'),
('Manual Research', NULL, 'manual', 4, 'Ręcznie zebrane dane podczas researchu');
-- ============================================================
-- USEFUL VIEWS
-- ============================================================
-- Complete company view with all relations
CREATE VIEW v_companies_full AS
SELECT
c.*,
cat.name as category_name,
cat.slug as category_slug,
-- Count relations
(SELECT COUNT(*) FROM company_services WHERE company_id = c.id) as services_count,
(SELECT COUNT(*) FROM company_competencies WHERE company_id = c.id) as competencies_count,
(SELECT COUNT(*) FROM certifications WHERE company_id = c.id) as certifications_count,
(SELECT COUNT(*) FROM awards WHERE company_id = c.id) as awards_count,
-- Latest financial data
(SELECT revenue FROM financial_data WHERE company_id = c.id ORDER BY year DESC LIMIT 1) as latest_revenue,
(SELECT year FROM financial_data WHERE company_id = c.id ORDER BY year DESC LIMIT 1) as latest_revenue_year
FROM companies c
LEFT JOIN categories cat ON c.category_id = cat.id;
-- Data quality statistics
CREATE VIEW v_data_quality_stats AS
SELECT
status,
data_quality,
COUNT(*) as count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
FROM companies
GROUP BY status, data_quality
ORDER BY status, data_quality;
-- Companies missing critical data
CREATE VIEW v_companies_incomplete AS
SELECT
id,
name,
CASE WHEN nip IS NULL THEN 'missing_nip' END as missing_nip,
CASE WHEN website IS NULL THEN 'missing_website' END as missing_website,
CASE WHEN email IS NULL THEN 'missing_email' END as missing_email,
CASE WHEN phone IS NULL THEN 'missing_phone' END as missing_phone,
data_quality,
last_updated
FROM companies
WHERE nip IS NULL OR website IS NULL OR email IS NULL OR phone IS NULL
ORDER BY data_quality, last_updated;
-- ============================================================
-- HELPER FUNCTIONS
-- ============================================================
-- Function to calculate data quality score
CREATE OR REPLACE FUNCTION calculate_data_quality(company_id_param INTEGER)
RETURNS data_quality_level AS $$
DECLARE
score INTEGER := 0;
c RECORD;
BEGIN
SELECT * INTO c FROM companies WHERE id = company_id_param;
-- Required fields (5 points each)
IF c.nip IS NOT NULL THEN score := score + 5; END IF;
IF c.website IS NOT NULL THEN score := score + 5; END IF;
IF c.email IS NOT NULL THEN score := score + 5; END IF;
IF c.phone IS NOT NULL THEN score := score + 5; END IF;
-- High priority fields (3 points each)
IF c.legal_name IS NOT NULL THEN score := score + 3; END IF;
IF c.regon IS NOT NULL THEN score := score + 3; END IF;
IF c.krs IS NOT NULL THEN score := score + 3; END IF;
IF c.address_full IS NOT NULL THEN score := score + 3; END IF;
IF c.description_full IS NOT NULL THEN score := score + 3; END IF;
-- Services and competencies
IF (SELECT COUNT(*) FROM company_services WHERE company_id = company_id_param) >= 3 THEN
score := score + 3;
END IF;
IF (SELECT COUNT(*) FROM company_competencies WHERE company_id = company_id_param) >= 3 THEN
score := score + 3;
END IF;
-- Return quality level based on score
IF score >= 35 THEN RETURN 'complete'::data_quality_level;
ELSIF score >= 20 THEN RETURN 'partial'::data_quality_level;
ELSIF score >= 10 THEN RETURN 'basic'::data_quality_level;
ELSE RETURN 'basic'::data_quality_level;
END IF;
END;
$$ LANGUAGE plpgsql;
-- Function to update data quality for all companies
CREATE OR REPLACE FUNCTION update_all_data_quality()
RETURNS void AS $$
BEGIN
UPDATE companies
SET data_quality = calculate_data_quality(id);
END;
$$ LANGUAGE plpgsql;
-- ============================================================
-- GRANTS (adjust for your user)
-- ============================================================
-- Create application user
-- CREATE USER nordabiz_app WITH PASSWORD 'your_secure_password';
-- Grant permissions
-- GRANT CONNECT ON DATABASE nordabiz TO nordabiz_app;
-- GRANT USAGE ON SCHEMA public TO nordabiz_app;
-- GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO nordabiz_app;
-- GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO nordabiz_app;
-- ============================================================
-- END OF SCHEMA
-- ============================================================