MDP (Markov Decision Process) · Tae Hyun Kim (Lowell)

MDP 요소	프라이싱 해석
상태 s	재고, 시간, 수요 이력, 경쟁 가격
행동 a	설정할 가격
보상 r	(가격 - 비용) × 판매량
전이 P	재고 감소, 시간 경과

Example

재고 연동 프라이싱

import numpy as np
from collections import defaultdict

class PricingMDP:
    def __init__(self, max_inventory, max_time, price_levels, demand_func, cost):
        self.max_inventory = max_inventory
        self.max_time = max_time
        self.price_levels = price_levels
        self.demand_func = demand_func  # demand_func(price, time) -> expected demand
        self.cost = cost

    def get_state(self, inventory, time):
        return (inventory, time)

    def transition(self, state, action):
        """
        상태 전이와 보상
        """
        inventory, time = state
        price = self.price_levels[action]

        # 수요 샘플링 (포아송)
        expected_demand = self.demand_func(price, time)
        demand = np.random.poisson(expected_demand)

        # 판매량 (재고 제한)
        sales = min(demand, inventory)

        # 보상
        reward = (price - self.cost) * sales

        # 새 상태
        new_inventory = inventory - sales
        new_time = time + 1

        done = (new_time >= self.max_time) or (new_inventory == 0)

        return self.get_state(new_inventory, new_time), reward, done

def value_iteration(mdp, gamma=0.99, theta=1e-6):
    """
    가치 반복으로 최적 정책 찾기
    """
    # 상태 공간 열거
    states = [(i, t) for i in range(mdp.max_inventory + 1)
              for t in range(mdp.max_time)]

    V = defaultdict(float)
    policy = defaultdict(int)

    while True:
        delta = 0
        for state in states:
            if state[0] == 0 or state[1] >= mdp.max_time:
                continue

            v = V[state]
            action_values = []

            for action in range(len(mdp.price_levels)):
                # 기대 보상 계산 (시뮬레이션 평균)
                total_value = 0
                n_samples = 100
                for _ in range(n_samples):
                    next_state, reward, done = mdp.transition(state, action)
                    if done:
                        total_value += reward
                    else:
                        total_value += reward + gamma * V[next_state]
                action_values.append(total_value / n_samples)

            V[state] = max(action_values)
            policy[state] = np.argmax(action_values)
            delta = max(delta, abs(v - V[state]))

        if delta < theta:
            break

    return V, policy

# 사용 예시
def demand_function(price, time):
    """가격과 시간에 따른 기대 수요"""
    base_demand = 10
    price_sensitivity = -0.3
    time_effect = 1 + 0.1 * time  # 시간이 지날수록 수요 증가
    return max(0, base_demand + price_sensitivity * price) * time_effect

mdp = PricingMDP(
    max_inventory=100,
    max_time=10,
    price_levels=[10, 15, 20, 25, 30],
    demand_func=demand_function,
    cost=8
)

V, policy = value_iteration(mdp)

# 최적 정책 출력
print("최적 정책 (재고, 시간) -> 가격:")
for inv in [100, 50, 20, 5]:
    for t in [0, 3, 6, 9]:
        state = (inv, t)
        if state in policy:
            optimal_price = mdp.price_levels[policy[state]]
            print(f"  재고={inv}, t={t}: ${optimal_price}")

DQN 기반 프라이싱

import torch
import torch.nn as nn

class DQNPricingAgent:
    def __init__(self, state_dim, n_price_levels, price_levels):
        self.price_levels = price_levels
        self.n_price_levels = n_price_levels
        self.epsilon = 1.0
        self.gamma = 0.99

        self.q_network = nn.Sequential(
            nn.Linear(state_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, n_price_levels)
        )

    def select_action(self, state, training=True):
        if training and np.random.random() < self.epsilon:
            action = np.random.randrange(self.n_price_levels)
        else:
            with torch.no_grad():
                q_values = self.q_network(torch.FloatTensor(state))
                action = q_values.argmax().item()
        return action, self.price_levels[action]

Contextual Bandits - 상태 전이 없는 특수 케이스
Thompson Sampling - 탐색-활용 전략
Policy Trees - 해석 가능한 정책

References

Puterman, M. L. (2014). Markov Decision Processes.
Sutton, R. S., & Barto, A. G. (2018). Reinforcement Learning: An Introduction.
Comprehensive Personalized Pricing Guide, Part VII, §19.1

연결 그래프