Building MDPs

In Stormvogel, a Markov Decision Process (MDP) consists of:

  • states \(S\),

  • actions \(A\),

  • an initial state \(s_0\),

  • a mapping from states to sets of enabled actions,

  • a successor distribution \(P(s,a)\) for every state \(s\) and every enabled action \(a\), i.e., a set of transitions from \(s\) to successor states \(s'\), each annotated with the action and a probability,

  • state labels \(L(s)\).

Here we show how to construct a simple example MDP using both the pgc API and the model builder API. The idea is that you can choose to study (you will likely pass the exam, but you have less free time) or not to study (you will have more free time, but you risk failing the exam).
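Concretely, the components of this study MDP look as follows (a sketch of the model that is built below, writing \(\tau\) for the empty action):

\[
P(s_0, \text{study})(\text{studied}) = 1, \qquad
P(\text{studied}, \tau)(\text{pass test}) = \tfrac{9}{10}, \qquad
P(\text{didn't study}, \tau)(\text{pass test}) = \tfrac{2}{5},
\]

with the remaining probability mass going to the corresponding "fail test" state, and both "pass test" and "fail test" moving to "end" with probability 1.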

[1]:
from stormvogel import pgc
from stormvogel.model import EmptyAction, ModelType
from stormvogel.show import show
from stormvogel.layout import Layout

init = pgc.State(x=[])  # x holds the list of labels of a state

def available_actions(s: pgc.State):
    if s == init: # If we are in the initial state, we have a choice.
        return [pgc.Action(["study"]), pgc.Action(["don't study"])]
    else: # Otherwise, we don't have any choice, we are just a Markov chain.
        return [pgc.Action([])]

def delta(s: pgc.State, a: pgc.Action):
    if "study" in a.labels:
        return [(1, pgc.State(x=["studied"]))]
    elif "don't study" in a.labels:
        return [(1, pgc.State(x=["didn't study"]))]
    elif "studied" in s.x:
        return [(9/10, pgc.State(x=["pass test"])), (1/10, pgc.State(x=["fail test"]))]
    elif "didn't study" in s.x:
        return [(2/5, pgc.State(x=["pass test"])), (3/5, pgc.State(x=["fail test"]))]
    else:
        return [(1, pgc.State(x=["end"]))]

labels = lambda s: s.x

# For rewards, provide a dict that maps each reward model name to a value.
# Using more than one key gives you multiple reward models.
def rewards(s: pgc.State, a: pgc.Action):
    if "pass test" in s.x:
        return {"r1": 100}
    elif "didn't study" in s.x:
        return {"r1": 15}
    else:
        return {"r1": 0}


pgc_study = pgc.build_pgc(
    delta=delta,
    initial_state_pgc=init,
    available_actions=available_actions,
    labels=labels,
    modeltype=ModelType.MDP,
    rewards=rewards
)
vis = show(pgc_study, layout=Layout("layouts/pinkgreen.json"))
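As a quick check, we can query the resulting model for labeled states. This is a sketch that assumes build_pgc returns a regular stormvogel model, so the same get_states_with_label method used later in this notebook applies:

print(pgc_study.get_states_with_label("pass test"))

The same model can also be built with the model builder API: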
[2]:
import stormvogel.model
from stormvogel.layout import Layout
from stormvogel.show import show
from stormvogel.model import EmptyAction

mdp = stormvogel.model.new_mdp("Study")

init = mdp.get_initial_state()
study = mdp.action("study")
not_study = mdp.action("don't study")

studied = mdp.new_state("studied")
not_studied = mdp.new_state("didn't study")
pass_test = mdp.new_state("pass test")
fail_test = mdp.new_state("fail test")
end = mdp.new_state("end")

init.set_transitions([
    (study, studied),
    (not_study, not_studied)
])

studied.set_transitions([
    (9/10, pass_test),
    (1/10, fail_test)
])

not_studied.set_transitions([
    (4/10, pass_test),
    (6/10, fail_test)
])

pass_test.set_transitions([(1, end)])
fail_test.set_transitions([(1, end)])

reward_model = mdp.add_rewards("R")
reward_model.set_state_action_reward(pass_test, EmptyAction, 100)
reward_model.set_state_action_reward(fail_test, EmptyAction, 0)
reward_model.set_state_action_reward(not_studied, EmptyAction, 15)
reward_model.set_unset_rewards(0)

# Give 'end' a self loop so that the model has no deadlock states,
# matching the pgc version above.
mdp.add_self_loops()

vis = show(mdp, layout=Layout("layouts/pinkgreen.json"))

A famous but more involved example of an MDP is the Monty Hall problem: a car is hidden behind one of three doors, the contestant picks a door, the host opens one of the other doors to reveal a goat, and the contestant then decides to stay with their door or switch to the remaining closed one. We show how to construct it using the pgc API.

[3]:
from stormvogel import pgc
from stormvogel.model import EmptyAction, ModelType
from stormvogel.show import show
from stormvogel.layout import Layout

init = pgc.State(x=[])

empty_action = pgc.Action([])

def available_actions(s: pgc.State):
    if "carchosen" in s.x:
        return [pgc.Action(["open", str(i)]) for i in range(1,4)]
    elif "opened" in s.x:
        return [pgc.Action(["stay"]), pgc.Action(["switch"])]
    return [empty_action]


def delta(s: pgc.State, a: pgc.Action):
    if s == init:
        return [(1/3, pgc.State(x=["carchosen", str(i)], car=i)) for i in range(1,4)]
    elif "open" in a.labels:
        # Store the chosen door as an int so that it can be compared to s.car below.
        return [(1, pgc.State(x=["open", a.labels[1]], car=s.car, chosen=int(a.labels[1])))]
    elif "open" in s.x:
        # The host can only reveal a door that hides a goat and was not chosen.
        possible_goats = {1, 2, 3} - {s.car} - {s.chosen}
        return [(1/len(possible_goats),
                pgc.State(x=["opened", str(s.chosen), f"goat,{i}"],
                car=s.car, goat=i, chosen=s.chosen))
                for i in possible_goats]
    elif "stay" in a.labels:
        return [(1, pgc.State(x=["won"], pred=s))] if s.chosen == s.car else [(1, pgc.State(x=["lost"], pred=s))]
    elif "switch" in a.labels:
        return [(1, pgc.State(x=["won"], pred=s))] if s.chosen != s.car else [(1, pgc.State(x=["lost"], pred=s))]
    else:
        return [(1, s)]

labels = lambda s: s.x

pgc_monty = pgc.build_pgc(
    delta=delta,
    initial_state_pgc=init,
    available_actions=available_actions,
    labels=labels,
    modeltype=ModelType.MDP
)

vis2 = show(pgc_monty, layout=Layout("layouts/monty.json"))
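To get a feel for the result, we can count the terminal states. Every won or lost state stores its predecessor in pred, so each distinct play of the game ends in its own terminal state (again assuming the model returned by build_pgc supports get_states_with_label):

print(len(pgc_monty.get_states_with_label("won")), len(pgc_monty.get_states_with_label("lost")))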

And here is a way to do it using the model API. Do note that this model is slightly different: the doors are numbered 0 through 2, and instead of won and lost states, the final states are labeled done, with target marking the winning outcomes.

[4]:
# We create the Monty Hall MDP
mdp = stormvogel.model.new_mdp("Monty Hall")

init = mdp.get_initial_state()

# first choose car position
init.set_transitions(
    [(1 / 3, mdp.new_state("carchosen", {"car_pos": i})) for i in range(3)]
)

# we choose a door in each case
for s in mdp.get_states_with_label("carchosen"):
    s.set_transitions(
        [
            (
                mdp.action(f"open{i}"),
                mdp.new_state("open", s.valuations | {"chosen_pos": i}),
            )
            for i in range(3)
        ]
    )

# the other goat is revealed
for s in mdp.get_states_with_label("open"):
    car_pos = s.valuations["car_pos"]
    chosen_pos = s.valuations["chosen_pos"]
    other_pos = {0, 1, 2} - {car_pos, chosen_pos}
    s.set_transitions(
        [
            (
                1 / len(other_pos),
                mdp.new_state("goatrevealed", s.valuations | {"reveal_pos": i}),
            )
            for i in other_pos
        ]
    )

# we must choose whether we want to switch
for s in mdp.get_states_with_label("goatrevealed"):
    car_pos = s.valuations["car_pos"]
    chosen_pos = s.valuations["chosen_pos"]
    reveal_pos = s.valuations["reveal_pos"]
    other_pos = list({0, 1, 2} - {reveal_pos, chosen_pos})[0]
    s.set_transitions(
        [
            (
                mdp.action("stay"),
                mdp.new_state(
                    ["done"] + (["target"] if chosen_pos == car_pos else []),
                    s.valuations | {"chosen_pos": chosen_pos},
                ),
            ),
            (
                mdp.action("switch"),
                mdp.new_state(
                    ["done"] + (["target"] if other_pos == car_pos else []),
                    s.valuations | {"chosen_pos": other_pos},
                ),
            ),
        ]
    )

# we add self loops to all states with no outgoing transitions
mdp.add_self_loops()
vis = show(mdp, layout=Layout("layouts/monty.json"))
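Since every state carries its valuations, we can inspect the winning outcomes directly, using only calls that already appear in the cell above:

for s in mdp.get_states_with_label("target"):
    print(s.valuations)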