Building MDPs

A Markov Decision Process (MDP) is an extension of a Markov chain. It consists of:

  • states

  • actions

  • transitions

Where

  • actions are essentially ‘decisions’, which are resolved by schedulers/policies/agents (these terms are equivalent)

  • each state has one or more available actions

  • each action leads to a probability distribution over successor states (see the sketch below)
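
To make the structure concrete, here is a minimal sketch in plain Python (our own illustration, not stormvogel code): each state maps to its available actions, and each action maps to a list of (probability, successor) pairs.

mdp_sketch = {
    "init": {
        "study": [(1.0, "studied")],
        "don't study": [(1.0, "didn't study")],
    },
    # A state with a single, unlabeled action behaves like a Markov chain state.
    "studied": {"": [(0.9, "pass test"), (0.1, "fail test")]},
    "didn't study": {"": [(0.4, "pass test"), (0.6, "fail test")]},
}

# Sanity check: every action's outgoing probabilities must sum to one.
for actions in mdp_sketch.values():
    for branches in actions.values():
        assert abs(sum(p for p, _ in branches) - 1.0) < 1e-9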

In general, the model API is more convenient for smaller models, while the pgc API scales better to bigger models.

Here we show how to construct a simple example MDP using both the pgc API and the model builder API. The idea is that you can choose to study (you will likely pass the exam, but you have less free time) or not to study (you have more free time, but you risk failing the exam).

[1]:
from stormvogel import pgc
from stormvogel.model import EmptyAction, ModelType
from stormvogel.show import show
from stormvogel.layout import Layout

init = pgc.State(x="")

def available_actions(s: pgc.State):
    if s == init: # If we are in the initial state, we have a choice.
        return [pgc.Action(["study"]), pgc.Action(["don't study"])]
    else: # Otherwise, we don't have any choice, we are just a Markov chain.
        return [pgc.Action([])]

def delta(s: pgc.State, a: pgc.Action):
    if "study" in a.labels:
        return [(1, pgc.State(x=["studied"]))]
    elif "don't study" in a.labels:
        return [(1, pgc.State(x=["didn't study"]))]
    elif "studied" in s.x:
        return [(9/10, pgc.State(x=["pass test"])), (1/10, pgc.State(x=["fail test"]))]
    elif "didn't study" in s.x:
        return [(2/5, pgc.State(x=["pass test"])), (3/5, pgc.State(x=["fail test"]))]
    else:
        return [(1, pgc.State(x=["end"]))]

labels = lambda s: s.x

# For rewards, you return a dict that maps each reward model name to a value.
# Using more than one key creates multiple reward models.
def rewards(s: pgc.State, a: pgc.Action):
    if "pass test" in s.x:
        return {"r1": 100}
    elif "didn't study" in s.x:
        return {"r1": 15}
    else:
        return {"r1": 0}


pgc_study = pgc.build_pgc(
    delta=delta,
    initial_state_pgc=init,
    available_actions=available_actions,
    labels=labels,
    modeltype=ModelType.MDP,
    rewards=rewards
)
vis = show(pgc_study, layout=Layout("layouts/pinkgreen.json"))
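The result of build_pgc can be inspected with the same accessors used elsewhere in this notebook. A quick sanity check (a sketch, assuming build_pgc returns the same kind of model object as stormvogel.model.new_mdp):

# Inspect the built model; both accessors appear later in this notebook
# on models created with new_mdp.
print(pgc_study.get_initial_state())
print(pgc_study.get_states_with_label("pass test"))
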
[2]:
import stormvogel.model
from stormvogel.layout import Layout
from stormvogel.show import show
from stormvogel.model import EmptyAction

mdp = stormvogel.model.new_mdp("Study")

init = mdp.get_initial_state()
study = mdp.action("study")
not_study = mdp.action("don't study")

studied = mdp.new_state("studied")
not_studied = mdp.new_state("didn't study")
pass_test = mdp.new_state("pass test")
fail_test = mdp.new_state("fail test")
end = mdp.new_state("end")

# In the initial state we choose an action; an (action, state) pair denotes a
# transition that is taken with probability 1.
init.set_transitions([
    (study, studied),
    (not_study, not_studied)
])

studied.set_transitions([
    (9/10, pass_test),
    (1/10, fail_test)
])

not_studied.set_transitions([
    (4/10, pass_test),
    (6/10, fail_test)
])

pass_test.set_transitions([(1, end)])
fail_test.set_transitions([(1, end)])

# Rewards are assigned per state-action pair; since these states offer no real
# choice, the reward is attached to EmptyAction.
reward_model = mdp.add_rewards("R")
reward_model.set_state_action_reward(pass_test, EmptyAction, 100)
reward_model.set_state_action_reward(fail_test, EmptyAction, 0)
reward_model.set_state_action_reward(not_studied, EmptyAction, 15)
# All state-action pairs without an explicit reward get 0.
reward_model.set_unset_rewards(0)

vis = show(mdp, layout=Layout("layouts/pinkgreen.json"), name="study")

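Before moving on to a bigger example, note that we can already work out by hand which choice an optimal scheduler makes; the arithmetic below is plain Python and uses only the rewards defined above.

# Expected total reward when studying: reach "pass test" with probability 9/10.
exp_study = 9/10 * 100 + 1/10 * 0
# Expected total reward when not studying: collect 15 in "didn't study",
# then pass with probability 4/10.
exp_not_study = 15 + 4/10 * 100 + 6/10 * 0
print(exp_study, exp_not_study)  # 90.0 55.0, so studying is optimal
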
A famous but more involved example of an MDP is the Monty Hall problem. We will show how to construct it using the pgc API.

[3]:
from stormvogel import pgc
from stormvogel.model import EmptyAction, ModelType
from stormvogel.show import show
from stormvogel.layout import Layout

init = pgc.State(x="")

empty_action = pgc.Action([])

def available_actions(s: pgc.State):
    if "carchosen" in s.x:
        return [pgc.Action(["open", str(i)]) for i in range(1,4)]
    elif "opened" in s.x:
        return [pgc.Action(["stay"]), pgc.Action(["switch"])]
    return [empty_action]


def delta(s: pgc.State, a: pgc.Action):
    if s == init:
        return [(1/3, pgc.State(x=["carchosen", str(i)], car=i)) for i in range(1,4)]
    elif "open" in a.labels:
        return [(1, pgc.State(x=["open", str(a.labels[1])], car=s.car, chosen=a.labels[1]))]
    elif "open" in s.x:
        possible_goats = set([1,2,3]) - {s.car} - {s.chosen}
        return [(1/len(possible_goats),
                pgc.State(x=["opened", str(s.chosen), f"goat,{i}"],
                car=s.car, goat=i, chosen=s.chosen))
                for i in possible_goats]
    elif "stay" in a.labels:
        return [(1, pgc.State(x=["won"], pred=s))] if s.chosen == s.car else [(1, pgc.State(x=["lost"], pred=s))]
    elif "switch" in a.labels:
        return [(1, pgc.State(x=["won"], pred=s))] if s.chosen != s.car else [(1, pgc.State(x=["lost"], pred=s))]
    else:
        return [(1, s)]

labels = lambda s: s.x

pgc_monty = pgc.build_pgc(
    delta=delta,
    initial_state_pgc=init,
    available_actions=available_actions,
    labels=labels,
    modeltype=ModelType.MDP
)

vis2 = show(pgc_monty, layout=Layout("layouts/monty.json"))
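As a plain-Python cross-check of the probabilities this MDP encodes: the car position and the initial choice are independent and uniform, and staying wins exactly when they coincide.

from fractions import Fraction

# Enumerate the nine equally likely (car, chosen) combinations.
stay_wins = Fraction(sum(car == chosen for car in range(1, 4) for chosen in range(1, 4)), 9)
print(stay_wins, 1 - stay_wins)  # 1/3 2/3: switching doubles the chance of winning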

And here is a way to build it using the model API. Note that this model is slightly different: the doors are numbered 0 through 2, and state information is carried in valuations rather than in the labels.

[4]:
# We create the Monty Hall MDP
mdp = stormvogel.model.new_mdp("Monty Hall")

init = mdp.get_initial_state()

# First, the position of the car is chosen uniformly at random.
init.set_transitions(
    [(1 / 3, mdp.new_state("carchosen", {"car_pos": i})) for i in range(3)]
)

# In each case, the contestant chooses one of the three doors.
for s in mdp.get_states_with_label("carchosen"):
    s.set_transitions(
        [
            (
                mdp.action(f"open{i}"),
                mdp.new_state("open", s.valuations | {"chosen_pos": i}),
            )
            for i in range(3)
        ]
    )

# The host reveals a goat behind one of the remaining doors.
for s in mdp.get_states_with_label("open"):
    car_pos = s.valuations["car_pos"]
    chosen_pos = s.valuations["chosen_pos"]
    other_pos = {0, 1, 2} - {car_pos, chosen_pos}
    s.set_transitions(
        [
            (
                1 / len(other_pos),
                mdp.new_state("goatrevealed", s.valuations | {"reveal_pos": i}),
            )
            for i in other_pos
        ]
    )

# Finally, we choose whether to stay with our door or to switch.
for s in mdp.get_states_with_label("goatrevealed"):
    car_pos = s.valuations["car_pos"]
    chosen_pos = s.valuations["chosen_pos"]
    reveal_pos = s.valuations["reveal_pos"]
    other_pos = list({0, 1, 2} - {reveal_pos, chosen_pos})[0]
    s.set_transitions(
        [
            (
                mdp.action("stay"),
                mdp.new_state(
                    ["done"] + (["target"] if chosen_pos == car_pos else []),
                    s.valuations | {"chosen_pos": chosen_pos},
                ),
            ),
            (
                mdp.action("switch"),
                mdp.new_state(
                    ["done"] + (["target"] if other_pos == car_pos else []),
                    s.valuations | {"chosen_pos": other_pos},
                ),
            ),
        ]
    )

# We add self-loops to all states with no outgoing transitions.
mdp.add_self_loops()
vis = show(mdp, layout=Layout("layouts/monty.json"))
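The finished model can again be inspected programmatically. As a quick sketch using the label accessors from above, we can count the terminal states and those marked as winning:

done_states = mdp.get_states_with_label("done")
target_states = mdp.get_states_with_label("target")
print(len(done_states), len(target_states))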