SQL Column Profiling
1. Count of NULL values:
SELECT COUNT(*) - COUNT(column_name) AS null_count
FROM table_name;
2. Distinct values and their counts:
SELECT column_name, COUNT(*)
FROM table_name
GROUP BY column_name;
3. Minimum and maximum values:
SELECT MIN(column_name) AS min_value, MAX(column_name) AS max_value
FROM table_name;
4. Average and standard deviation:
SELECT AVG(column_name) AS avg_value, STDDEV(column_name) AS stddev_value
FROM table_name;
5. String length distribution:
SELECT LENGTH(column_name) AS length, COUNT(*)
FROM table_nameGROUP BY LENGTH(column_name);
SQL Cross-Column Profiling
1. Correlation between two numeric columns:
SELECT CORR(column1, column2) AS correlation
FROM table_name;
2. Finding unique combinations of values across columns:
SELECT column1, column2, COUNT(*)
FROM table_name
GROUP BY column1, column2;
3. Detecting functional dependencies:
SELECT column1, column2, COUNT(DISTINCT column2) AS unique_values
FROM table_name
GROUP BY column1
HAVING COUNT(DISTINCT column2) = 1;
4. Checking for null values across multiple columns:
SELECT COUNT(*) AS null_count
FROM table_name
WHERE column1 IS NULL OR column2 IS NULL;
SQL Cross-Table Profiling
1. Foreign Key Relationships:
SELECTtc.table_schema,tc.table_name,kcu.column_name,ccu.table_schema AS foreign_table_schema,ccu.table_name AS foreign_table_name,ccu.column_name AS foreign_column_nameFROMinformation_schema.table_constraints AS tcJOIN information_schema.key_column_usage AS kcuON tc.constraint_name = kcu.constraint_nameAND tc.table_schema = kcu.table_schemaJOIN information_schema.constraint_column_usage AS ccuON ccu.constraint_name = tc.constraint_nameAND ccu.table_schema = tc.table_schemaWHERE tc.constraint_type = 'FOREIGN KEY';
2. Join Analysis:
SELECTt1.column1, t2.column2, COUNT(*)FROMtable1 t1JOIN table2 t2 ON t1.common_column = t2.common_columnGROUP BYt1.column1, t2.column2;
3. Referential Integrity Checks:
SELECTt1.common_columnFROMtable1 t1LEFT JOINtable2 t2 ON t1.common_column = t2.common_columnWHEREt2.common_column IS NULL;
SQL Data Rule Validation Profiling
1. Check for non-null values:
SELECT COUNT(*) AS null_countFROM table_nameWHERE column_name IS NULL;
2. Check for unique values:
SELECT column_name, COUNT(*)FROM table_nameGROUP BY column_nameHAVING COUNT(*) > 1;
3. Check for valid ranges:
SELECT *FROM table_nameWHERE column_name < min_value OR column_name > max_value;
4. Check for specific patterns (e.g., email format):
SELECT *FROM table_nameWHERE column_name NOT LIKE '%_@__%.__%';
5. Check for foreign key constraints:
SELECT t1.*FROM table1 t1LEFT JOIN table2 t2 ON t1.foreign_key = t2.primary_keyWHERE t2.primary_key IS NULL;
SQL Cardinality
1. Count Distinct Values:
SELECT COUNT(DISTINCT column_name) AS cardinalityFROM table_name;
2. High Cardinality Example:
SELECT column_name, COUNT(*) AS frequencyFROM table_nameGROUP BY column_nameORDER BY frequency DESC;
3. Low Cardinality Example:
SELECT column_name, COUNT(*) AS frequencyFROM table_nameGROUP BY column_nameHAVING COUNT(*) > 1ORDER BY frequency DESC;
References

.jpeg)
No comments:
Post a Comment