빅쿼리 배열과 구조체
복합 데이터 타입 활용
BigQuery에서 배열(ARRAY)과 구조체(STRUCT)를 활용한 중첩 데이터 처리 방법을 다루는 종합 가이드입니다.
목차
1. 배열과 구조체 개념
1.1 배열(ARRAY)이란?
배열은 동일한 타입의 값들을 순서대로 저장하는 데이터 구조입니다.
-- 기본 배열 생성
SELECT
[1, 2, 3, 4, 5] as numbers,
['apple', 'banana', 'cherry'] as fruits,
[DATE '2024-01-01', DATE '2024-01-02'] as dates;
1.2 구조체(STRUCT)란?
구조체는 서로 다른 타입의 필드들을 가진 복합 데이터 타입입니다.
-- 기본 구조체 생성
SELECT
STRUCT('John' as name, 25 as age, 'Engineer' as job) as person,
STRUCT(
'Product A' as name,
100.0 as price,
['electronics', 'gadget'] as categories
) as product;
1.3 중첩 구조의 장점
-- 관계형 모델 (정규화)
-- customers 테이블
-- customer_id | name | email
-- orders 테이블
-- order_id | customer_id | amount
-- order_items 테이블
-- item_id | order_id | product | quantity
-- BigQuery 중첩 모델 (비정규화)
SELECT
customer_id,
customer_info.name,
customer_info.email,
orders -- 배열
FROM customer_orders_nested
WHERE customer_id = 123;
2. 배열 기본 사용법
2.1 배열 생성
-- 리터럴로 배열 생성
SELECT
[1, 2, 3] as simple_array,
['a', 'b', 'c'] as string_array,
[true, false, true] as bool_array;
-- 쿼리 결과로 배열 생성
SELECT
customer_id,
ARRAY_AGG(product_name) as purchased_products,
ARRAY_AGG(order_amount) as order_amounts
FROM orders
GROUP BY customer_id;
-- 조건부 배열 생성
SELECT
customer_id,
ARRAY_AGG(product_name ORDER BY order_date DESC LIMIT 5) as recent_products
FROM orders
GROUP BY customer_id;
2.2 배열 접근
-- 인덱스로 접근 (0부터 시작)
WITH sample_data AS (
SELECT ['apple', 'banana', 'cherry', 'date'] as fruits
)
SELECT
fruits,
fruits[OFFSET(0)] as first_fruit, -- 'apple'
fruits[OFFSET(1)] as second_fruit, -- 'banana'
fruits[SAFE_OFFSET(10)] as safe_access -- NULL (안전한 접근)
FROM sample_data;
-- ORDINAL 사용 (1부터 시작)
SELECT
fruits,
fruits[ORDINAL(1)] as first_fruit, -- 'apple'
fruits[ORDINAL(2)] as second_fruit -- 'banana'
FROM (SELECT ['apple', 'banana', 'cherry'] as fruits);
2.3 배열 조건부 접근
-- 조건에 맞는 배열 요소 처리
WITH user_scores AS (
SELECT
'user1' as user_id,
[85, 92, 78, 96, 88] as test_scores
)
SELECT
user_id,
test_scores,
-- 특정 점수 이상 개수
(SELECT COUNT(*)
FROM UNNEST(test_scores) as score
WHERE score >= 90) as high_scores_count,
-- 최고점과 최저점
(SELECT MAX(score) FROM UNNEST(test_scores) as score) as max_score,
(SELECT MIN(score) FROM UNNEST(test_scores) as score) as min_score
FROM user_scores;
3. 구조체 기본 사용법
3.1 구조체 생성
-- 기본 구조체 생성
SELECT
STRUCT(
'John Doe' as full_name,
30 as age,
'john@email.com' as email,
['reading', 'swimming'] as hobbies
) as user_profile;
-- 테이블에서 구조체 생성
SELECT
customer_id,
STRUCT(
first_name,
last_name,
email,
phone
) as contact_info
FROM customers;
3.2 구조체 접근
-- 도트 표기법으로 접근
WITH user_data AS (
SELECT STRUCT(
'Alice' as name,
28 as age,
STRUCT(
'123 Main St' as street,
'New York' as city,
'NY' as state
) as address
) as user_info
)
SELECT
user_info.name, -- 'Alice'
user_info.age, -- 28
user_info.address.street, -- '123 Main St'
user_info.address.city -- 'New York'
FROM user_data;
3.3 구조체 배열
-- 구조체의 배열
WITH order_data AS (
SELECT
'ORDER123' as order_id,
[
STRUCT('Product A' as name, 2 as quantity, 29.99 as price),
STRUCT('Product B' as name, 1 as quantity, 49.99 as price),
STRUCT('Product C' as name, 3 as quantity, 19.99 as price)
] as items
)
SELECT
order_id,
items,
-- 총 아이템 수
ARRAY_LENGTH(items) as total_items,
-- 총 주문 금액
(SELECT SUM(item.quantity * item.price)
FROM UNNEST(items) as item) as total_amount
FROM order_data;
4. 중첩 구조 처리
4.1 복잡한 중첩 구조
-- 실제 e-commerce 데이터 예시
WITH ecommerce_data AS (
SELECT
'CUST001' as customer_id,
STRUCT(
'John Smith' as name,
'john.smith@email.com' as email,
STRUCT(
'123 Oak Street' as street,
'San Francisco' as city,
'CA' as state,
'94102' as zipcode
) as shipping_address
) as customer_info,
[
STRUCT(
'ORD001' as order_id,
DATE '2024-01-15' as order_date,
[
STRUCT('Laptop' as product, 1 as qty, 999.99 as price),
STRUCT('Mouse' as product, 2 as qty, 25.00 as price)
] as items,
STRUCT(
49.99 as subtotal_discount,
'SAVE50' as coupon_code
) as discount_info
),
STRUCT(
'ORD002' as order_id,
DATE '2024-02-01' as order_date,
[
STRUCT('Keyboard' as product, 1 as qty, 150.00 as price)
] as items,
STRUCT(
0.0 as subtotal_discount,
NULL as coupon_code
) as discount_info
)
] as orders
)
SELECT * FROM ecommerce_data;
4.2 중첩 데이터 쿼리
-- 위 데이터에서 정보 추출
WITH ecommerce_data AS (
-- 위와 동일한 데이터
)
SELECT
customer_id,
customer_info.name,
customer_info.shipping_address.city,
-- 전체 주문 수
ARRAY_LENGTH(orders) as total_orders,
-- 총 구매 금액
(SELECT SUM(
(SELECT SUM(item.qty * item.price)
FROM UNNEST(order.items) as item) - order.discount_info.subtotal_discount
) FROM UNNEST(orders) as order) as total_spent,
-- 구매한 모든 상품
ARRAY(
SELECT DISTINCT item.product
FROM UNNEST(orders) as order,
UNNEST(order.items) as item
) as all_products,
-- 최근 주문일
(SELECT MAX(order.order_date)
FROM UNNEST(orders) as order) as last_order_date
FROM ecommerce_data;
5. 배열 함수
5.1 기본 배열 함수
-- 배열 생성 및 조작 함수들
WITH sample_arrays AS (
SELECT
[1, 2, 3, 4, 5] as numbers,
['a', 'b', 'c', 'd'] as letters,
[10, 20, 30] as values
)
SELECT
-- 길이
ARRAY_LENGTH(numbers) as numbers_length,
-- 연결
ARRAY_CONCAT(numbers, values) as concatenated,
-- 요소 추가
ARRAY_CONCAT(numbers, [6, 7]) as extended,
-- 뒤집기
ARRAY_REVERSE(letters) as reversed_letters,
-- 슬라이싱 (서브배열)
numbers[OFFSET(1):OFFSET(3)] as slice_1_to_3
FROM sample_arrays;
5.2 배열 집계 함수
-- ARRAY_AGG 고급 사용법
SELECT
category,
-- 기본 집계
ARRAY_AGG(product_name) as all_products,
-- 정렬된 집계
ARRAY_AGG(product_name ORDER BY price DESC) as products_by_price_desc,
-- 제한된 집계
ARRAY_AGG(product_name ORDER BY sales_count DESC LIMIT 3) as top_3_products,
-- 조건부 집계
ARRAY_AGG(
CASE WHEN price > 100 THEN product_name ELSE NULL END
IGNORE NULLS
) as expensive_products,
-- DISTINCT 집계
ARRAY_AGG(DISTINCT brand) as brands
FROM products
GROUP BY category;
5.3 배열 변환 함수
-- 배열 변환 및 필터링
WITH data AS (
SELECT
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10] as numbers
)
SELECT
numbers,
-- ARRAY_TRANSFORM 같은 기능 (서브쿼리 사용)
ARRAY(
SELECT num * 2
FROM UNNEST(numbers) as num
) as doubled,
-- 조건부 필터링
ARRAY(
SELECT num
FROM UNNEST(numbers) as num
WHERE num % 2 = 0
) as even_numbers,
-- 문자열로 변환
ARRAY(
SELECT CAST(num AS STRING)
FROM UNNEST(numbers) as num
) as number_strings,
-- 복잡한 변환
ARRAY(
SELECT STRUCT(num as original, num * num as squared)
FROM UNNEST(numbers) as num
WHERE num <= 5
) as number_squares
FROM data;
6. 구조체 함수
6.1 구조체 추출 함수
-- 구조체에서 정보 추출
WITH user_data AS (
SELECT [
STRUCT('John' as name, 25 as age, 'Engineer' as job, 75000 as salary),
STRUCT('Alice' as name, 30 as age, 'Manager' as job, 85000 as salary),
STRUCT('Bob' as name, 28 as age, 'Developer' as job, 70000 as salary)
] as employees
)
SELECT
-- 특정 필드만 추출
ARRAY(
SELECT emp.name
FROM UNNEST(employees) as emp
) as employee_names,
-- 조건부 필터링
ARRAY(
SELECT emp.name
FROM UNNEST(employees) as emp
WHERE emp.salary > 70000
) as high_earners,
-- 구조체 변환
ARRAY(
SELECT STRUCT(
emp.name,
emp.job,
CASE
WHEN emp.salary >= 80000 THEN 'Senior'
WHEN emp.salary >= 70000 THEN 'Mid'
ELSE 'Junior'
END as level
)
FROM UNNEST(employees) as emp
) as employee_levels
FROM user_data;
6.2 구조체 업데이트
-- 구조체 필드 업데이트 (재생성)
WITH original_data AS (
SELECT STRUCT(
'John Doe' as name,
'john@email.com' as email,
25 as age,
'Developer' as position
) as employee
)
SELECT
-- 나이 업데이트
STRUCT(
employee.name,
employee.email,
26 as age, -- 업데이트된 값
employee.position
) as updated_employee,
-- 새 필드 추가
STRUCT(
employee.name,
employee.email,
employee.age,
employee.position,
DATE '2024-01-01' as hire_date -- 새 필드
) as enhanced_employee
FROM original_data;
7. UNNEST와 배열 평면화
7.1 기본 UNNEST
-- 배열을 행으로 변환
WITH sample_data AS (
SELECT
'user1' as user_id,
['apple', 'banana', 'cherry'] as fruits
UNION ALL
SELECT
'user2' as user_id,
['orange', 'grape'] as fruits
)
SELECT
user_id,
fruit
FROM sample_data,
UNNEST(fruits) as fruit;
-- 결과:
-- user1 | apple
-- user1 | banana
-- user1 | cherry
-- user2 | orange
-- user2 | grape
7.2 UNNEST WITH OFFSET
-- 배열 인덱스와 함께 사용
WITH sample_data AS (
SELECT ['first', 'second', 'third', 'fourth'] as items
)
SELECT
items,
item_value,
item_position,
item_position + 1 as item_number
FROM sample_data,
UNNEST(items) as item_value WITH OFFSET as item_position
ORDER BY item_position;
7.3 복잡한 UNNEST 패턴
-- 구조체 배열의 UNNEST
WITH order_data AS (
SELECT
'ORDER123' as order_id,
DATE '2024-01-15' as order_date,
[
STRUCT('Product A' as name, 2 as quantity, 29.99 as unit_price),
STRUCT('Product B' as name, 1 as quantity, 49.99 as unit_price),
STRUCT('Product C' as name, 3 as quantity, 19.99 as unit_price)
] as items
)
SELECT
order_id,
order_date,
item.name as product_name,
item.quantity,
item.unit_price,
item.quantity * item.unit_price as line_total,
-- 윈도우 함수와 함께 사용
ROW_NUMBER() OVER (ORDER BY item.unit_price DESC) as price_rank,
SUM(item.quantity * item.unit_price) OVER () as order_total
FROM order_data,
UNNEST(items) as item
ORDER BY item.unit_price DESC;
7.4 다중 배열 UNNEST
-- 여러 배열을 동시에 UNNEST (길이가 같은 경우)
WITH multi_array_data AS (
SELECT
'student1' as student_id,
['Math', 'Science', 'History'] as subjects,
[95, 88, 92] as scores,
['A', 'B+', 'A-'] as grades
)
SELECT
student_id,
subject,
score,
grade
FROM multi_array_data,
UNNEST(subjects) as subject WITH OFFSET pos1,
UNNEST(scores) as score WITH OFFSET pos2,
UNNEST(grades) as grade WITH OFFSET pos3
WHERE pos1 = pos2 AND pos2 = pos3; -- 같은 위치끼리 매칭
8. 실제 활용 사례
8.1 웹 분석 - 사용자 이벤트 추적
-- 사용자 세션 데이터 분석
WITH user_sessions AS (
SELECT
session_id,
user_id,
session_start,
[
STRUCT('page_view' as event_type, '/home' as page, TIMESTAMP '2024-01-01 10:00:00' as timestamp),
STRUCT('page_view' as event_type, '/products' as page, TIMESTAMP '2024-01-01 10:02:00' as timestamp),
STRUCT('click' as event_type, 'add_to_cart' as page, TIMESTAMP '2024-01-01 10:03:00' as timestamp),
STRUCT('page_view' as event_type, '/checkout' as page, TIMESTAMP '2024-01-01 10:05:00' as timestamp),
STRUCT('purchase' as event_type, 'completed' as page, TIMESTAMP '2024-01-01 10:07:00' as timestamp)
] as events
FROM raw_sessions
)
SELECT
session_id,
user_id,
-- 세션 통계
ARRAY_LENGTH(events) as total_events,
-- 이벤트 타입별 개수
(SELECT COUNT(*) FROM UNNEST(events) as e WHERE e.event_type = 'page_view') as page_views,
(SELECT COUNT(*) FROM UNNEST(events) as e WHERE e.event_type = 'click') as clicks,
(SELECT COUNT(*) FROM UNNEST(events) as e WHERE e.event_type = 'purchase') as purchases,
-- 세션 지속 시간
TIMESTAMP_DIFF(
(SELECT MAX(e.timestamp) FROM UNNEST(events) as e),
session_start,
SECOND
) as session_duration_seconds,
-- 방문한 페이지 목록 (중복 제거)
ARRAY(
SELECT DISTINCT e.page
FROM UNNEST(events) as e
WHERE e.event_type = 'page_view'
ORDER BY e.page
) as visited_pages,
-- 구매 전환 여부
EXISTS(SELECT 1 FROM UNNEST(events) as e WHERE e.event_type = 'purchase') as converted
FROM user_sessions;
8.2 제품 추천 시스템
-- 사용자별 제품 선호도 및 추천
WITH user_preferences AS (
SELECT
user_id,
ARRAY_AGG(
STRUCT(
product_id,
product_name,
category,
rating,
purchase_count,
last_purchase_date
) ORDER BY rating DESC, purchase_count DESC
) as rated_products
FROM user_product_ratings
WHERE rating >= 4 -- 높은 평점만
GROUP BY user_id
),
category_preferences AS (
SELECT
user_id,
rated_products,
-- 카테고리별 선호도 계산
ARRAY(
SELECT STRUCT(
category,
COUNT(*) as product_count,
ROUND(AVG(product.rating), 2) as avg_rating,
SUM(product.purchase_count) as total_purchases
)
FROM UNNEST(rated_products) as product
GROUP BY category
ORDER BY avg_rating DESC, total_purchases DESC
) as category_preferences
FROM user_preferences
)
SELECT
user_id,
-- 상위 3개 선호 카테고리
ARRAY(
SELECT cat.category
FROM UNNEST(category_preferences) as cat
LIMIT 3
) as top_categories,
-- 최근 구매 제품들
ARRAY(
SELECT product.product_name
FROM UNNEST(rated_products) as product
WHERE product.last_purchase_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 30 DAY)
ORDER BY product.last_purchase_date DESC
) as recent_purchases,
-- 추천 점수가 높은 상위 5개 제품
ARRAY(
SELECT STRUCT(
product.product_name,
product.category,
product.rating,
-- 추천 점수: 평점 * 구매 횟수 * 카테고리 가중치
ROUND(product.rating * product.purchase_count *
COALESCE((
SELECT cat.avg_rating
FROM UNNEST(category_preferences) as cat
WHERE cat.category = product.category
), 1), 2) as recommendation_score
)
FROM UNNEST(rated_products) as product
ORDER BY recommendation_score DESC
LIMIT 5
) as top_recommendations
FROM category_preferences;
8.3 IoT 센서 데이터 분석
-- IoT 디바이스 센서 데이터 분석
WITH sensor_data AS (
SELECT
device_id,
DATE '2024-01-01' as measurement_date,
[
STRUCT('temperature' as sensor_type, [20.5, 21.2, 22.1, 21.8, 20.9] as hourly_readings),
STRUCT('humidity' as sensor_type, [45.2, 46.8, 48.1, 47.3, 46.5] as hourly_readings),
STRUCT('pressure' as sensor_type, [1013.2, 1012.8, 1014.1, 1013.9, 1013.5] as hourly_readings)
] as sensors
)
SELECT
device_id,
measurement_date,
-- 센서별 통계
ARRAY(
SELECT STRUCT(
sensor.sensor_type,
ARRAY_LENGTH(sensor.hourly_readings) as reading_count,
ROUND((SELECT AVG(reading) FROM UNNEST(sensor.hourly_readings) as reading), 2) as avg_value,
ROUND((SELECT MIN(reading) FROM UNNEST(sensor.hourly_readings) as reading), 2) as min_value,
ROUND((SELECT MAX(reading) FROM UNNEST(sensor.hourly_readings) as reading), 2) as max_value,
ROUND((SELECT STDDEV(reading) FROM UNNEST(sensor.hourly_readings) as reading), 2) as stddev_value
)
FROM UNNEST(sensors) as sensor
) as sensor_stats,
-- 이상치 감지 (표준편차 2배 이상)
ARRAY(
SELECT STRUCT(
sensor.sensor_type,
reading,
reading_position,
'outlier' as flag
)
FROM UNNEST(sensors) as sensor,
UNNEST(sensor.hourly_readings) as reading WITH OFFSET reading_position
WHERE ABS(reading - (SELECT AVG(r) FROM UNNEST(sensor.hourly_readings) as r)) >
2 * (SELECT STDDEV(r) FROM UNNEST(sensor.hourly_readings) as r)
) as outliers
FROM sensor_data;
9. 성능 최적화
9.1 배열 크기 제한
-- ❌ 비효율적: 무제한 배열 집계
SELECT
customer_id,
ARRAY_AGG(order_id) as all_orders -- 수천 개 주문이 있을 수 있음
FROM orders
GROUP BY customer_id;
-- ✅ 효율적: 크기 제한
SELECT
customer_id,
ARRAY_AGG(order_id ORDER BY order_date DESC LIMIT 10) as recent_orders
FROM orders
GROUP BY customer_id;
9.2 적절한 필터링
-- ❌ 비효율적: 전체 배열 처리 후 필터링
WITH all_data AS (
SELECT
user_id,
ARRAY_AGG(
STRUCT(event_type, event_time, event_data)
) as all_events
FROM events
GROUP BY user_id
)
SELECT
user_id,
(SELECT COUNT(*) FROM UNNEST(all_events) as e WHERE e.event_type = 'purchase') as purchases
FROM all_data;
-- ✅ 효율적: 미리 필터링
SELECT
user_id,
ARRAY_AGG(
STRUCT(event_type, event_time, event_data)
) as purchase_events
FROM events
WHERE event_type = 'purchase' -- 미리 필터링
GROUP BY user_id;
9.3 중첩 구조 최적화
-- 파티션된 테이블에서 배열 처리 최적화
SELECT
customer_id,
order_date,
ARRAY_AGG(
STRUCT(product_id, quantity, price)
ORDER BY price DESC
) as order_items
FROM orders_partitioned
WHERE order_date = CURRENT_DATE() -- 파티션 프루닝
GROUP BY customer_id, order_date
HAVING COUNT(*) <= 50; -- 대용량 주문 제외
10. 모범 사례와 주의점
10.1 모범 사례
1. 적절한 배열 크기 유지
-- ✅ 좋은 예: 크기 제한과 정렬
SELECT
product_id,
ARRAY_AGG(
STRUCT(user_id, rating, review_text)
ORDER BY rating DESC, review_date DESC
LIMIT 20 -- 최대 20개 리뷰만
) as top_reviews
FROM product_reviews
WHERE rating >= 4 -- 고평점만
GROUP BY product_id;
2. 명확한 구조체 필드 이름
-- ✅ 좋은 예: 명확한 필드명
SELECT
STRUCT(
first_name as given_name,
last_name as family_name,
birth_date as date_of_birth,
phone_number as primary_phone
) as contact_details
FROM users;
3. 적절한 중첩 레벨
-- ✅ 좋은 예: 2-3 레벨 중첩
SELECT
user_id,
STRUCT(
personal_info.name,
personal_info.email,
preferences.categories -- 배열
) as user_summary
FROM user_profiles;
-- ❌ 피해야 할 예: 과도한 중첩
-- STRUCT(STRUCT(STRUCT(...))) - 5+ 레벨
10.2 주의점
1. NULL 처리
-- NULL 안전 배열 처리
SELECT
customer_id,
ARRAY_AGG(
CASE
WHEN order_amount IS NOT NULL THEN order_amount
END
IGNORE NULLS
) as valid_order_amounts
FROM orders
GROUP BY customer_id;
2. 배열 순서 보장
-- ❌ 잘못된 예: 순서 보장 없음
SELECT
ARRAY_AGG(product_name) as products
FROM products;
-- ✅ 올바른 예: 명시적 정렬
SELECT
ARRAY_AGG(product_name ORDER BY product_name) as products_alphabetical,
ARRAY_AGG(product_name ORDER BY popularity DESC) as products_by_popularity
FROM products;
3. 타입 일관성
-- ❌ 잘못된 예: 타입 불일치
SELECT [1, '2', 3.0] as mixed_array; -- 오류 발생
-- ✅ 올바른 예: 일관된 타입
SELECT [CAST(1 AS STRING), '2', CAST(3.0 AS STRING)] as string_array;
4. 메모리 사용량 고려
-- 큰 배열 처리 시 주의
SELECT
customer_id,
-- 대용량 데이터의 경우 샘플링 고려
ARRAY_AGG(
order_details ORDER BY RAND() LIMIT 100 -- 랜덤 샘플링
) as sample_orders
FROM large_orders_table
GROUP BY customer_id;
10.3 디버깅 팁
-- 배열/구조체 구조 확인
WITH debug_data AS (
SELECT
customer_id,
ARRAY_AGG(
STRUCT(order_id, order_date, total_amount)
ORDER BY order_date DESC
) as orders
FROM orders
GROUP BY customer_id
LIMIT 1
)
SELECT
customer_id,
ARRAY_LENGTH(orders) as order_count,
orders[OFFSET(0)] as first_order, -- 첫 번째 주문 구조 확인
TO_JSON_STRING(orders[OFFSET(0)]) as first_order_json -- JSON으로 구조 확인
FROM debug_data;
BigQuery의 배열과 구조체는 복잡한 중첩 데이터를 효율적으로 처리할 수 있는 강력한 기능입니다. 적절한 구조 설계와 성능 최적화를 통해 관계형 데이터베이스로는 어려운 분석을 간단하게 수행할 수 있습니다.