log on action and prob for off-policy evaluation #43

Merged
Changes from 17 commits
16 changes: 8 additions & 8 deletions coba/evaluators/online.py
@@ -13,7 +13,7 @@
from coba.learners import Learner, SafeLearner
from coba.primitives import Batch, argmax
from coba.statistics import percentile
from coba.utilities import PackageChecker, peek_first
from coba.utilities import PackageChecker, peek_first, sample_actions

from coba.evaluators.primitives import Evaluator, get_ope_loss

@@ -235,15 +235,15 @@ def evaluate(self, environment: Optional[Environment], learner: Optional[Learner
predict_time = time.time()-start_time
if not batched:
ope_reward = sum(p*float(log_rewards.eval(a)) for p,a in zip(on_probs,log_actions))
on_action, on_prob = sample_actions(log_actions, on_probs)
else:
ope_reward = [ sum(p*float(R.eval(a)) for p,a in zip(P,A)) for P,A,R in zip(on_probs,log_actions,log_rewards) ]
on_action, on_prob = zip(*[sample_actions(actions, probs) for actions, probs in zip(log_actions, on_probs)])
else:
Contributor Author

How to do this for continuous actions?

Collaborator

For continuous actions we just need to call on_action,on_prob = predict(log_context, log_actions)[:2]
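A minimal sketch of how that suggestion might slot into the evaluator, assuming the same timing pattern and variable names (predict, log_context, log_actions) used elsewhere in this diff; the exact placement is an assumption:

    # Sketch only: for continuous actions there is no PMF to sample from, so the
    # learner's own prediction supplies the on-policy action and its probability.
    start_time = time.time()
    on_action, on_prob = predict(log_context, log_actions)[:2]
    predict_time = time.time() - start_time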

Collaborator

Maybe on line 246? I don't think we need to have separate processing for batched and non-batched. Man I hate all this batched logic. It's all here for neural network stuff we do where backpropagation with mini-batches can give huge gains in computation time.

Contributor Author

Tried to add support for continuous actions but am struggling to make some tests pass; see below.

start_time = time.time()
if not batched:
on_prob = request(log_context,log_actions,[log_action])
else:
on_prob = request(log_context,log_actions,log_action)
on_action, on_prob = predict(log_context, log_actions)[:2]
predict_time = time.time()-start_time

if not batched:
ope_reward = on_prob*float(log_rewards.eval(log_action))
else:
@@ -263,15 +263,15 @@ def evaluate(self, environment: Optional[Environment], learner: Optional[Learner
if record_time : out['predict_time'] = predict_time
if record_time : out['learn_time'] = learn_time
if record_reward: out['reward'] = ope_reward
if record_action: out['action'] = log_action
if record_prob: out['probability'] = log_prob
if record_action: out['action'] = on_action
if record_prob: out['probability'] = on_prob
if record_context: out['context'] = log_context
if record_actions: out['actions'] = log_actions
if record_rewards: out['rewards'] = log_rewards

out.update({k: interaction[k] for k in interaction.keys()-OffPolicyEvaluator.IMPLICIT_EXCLUDE})

if record_ope_loss: out['ope_loss'] = get_ope_loss(learner)
if record_ope_loss: out['ope_loss'] = get_ope_loss(learner) if not batched else [get_ope_loss(learner)] * len(log_context)
Contributor Author

Make OPE loss work for batched evaluation


if info:
out.update(info)
15 changes: 4 additions & 11 deletions coba/learners/safety.py
@@ -2,6 +2,7 @@
from math import isclose
from typing import Any, Sequence, Tuple, Mapping, Literal

from coba.utilities import sample_actions
from coba.exceptions import CobaException
from coba.random import CobaRandom
from coba.primitives import Batch, Context, Action, Actions
@@ -266,8 +267,7 @@ def predict(self, context: Context, actions: Actions) -> Tuple[Action,Prob,kwarg
pred = list(pred.values())[0]

if self._pred_format[:2] == 'PM':
i = self._get_pmf_index(pred)
a,p = actions[i], pred[i]
a,p = sample_actions(actions, pred, self._rng)

if self._pred_format[:2] == 'AP':
a,p = pred[:2]
Expand All @@ -287,9 +287,7 @@ def predict(self, context: Context, actions: Actions) -> Tuple[Action,Prob,kwarg

A,P = [],[]
if self._pred_format[:2] == 'PM':
I = [self._get_pmf_index(p) for p in pred]
A = [ a[i] for a,i in zip(actions,I) ]
P = [ p[i] for p,i in zip(pred,I) ]
A, P = list(map(list, zip(*[sample_actions(a, p, self._rng) for a, p in zip(actions, pred)])))
Contributor Author

Could remove the list(map(list, ...)) wrapping if it were OK to return a tuple instead of a list.
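A small sketch of the two variants, assuming downstream callers only need indexable sequences (all names are taken from the diff above):

    # Current form: forces A and P to be plain lists.
    A, P = list(map(list, zip(*[sample_actions(a, p, self._rng) for a, p in zip(actions, pred)])))

    # If returning tuples were acceptable, the wrapping could be dropped:
    A, P = zip(*[sample_actions(a, p, self._rng) for a, p in zip(actions, pred)])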


if self._pred_format[:2] == 'AX':
A = pred
@@ -308,9 +306,7 @@ def predict(self, context: Context, actions: Actions) -> Tuple[Action,Prob,kwarg
pred = list(pred.values())[0]

if self._pred_format[:2] == 'PM':
I = [self._get_pmf_index(p) for p in zip(*pred)]
A = [ a[i] for a,i in zip(actions,I) ]
P = [ p[i] for p,i in zip(pred,I) ]
A, P = list(map(list, zip(*[sample_actions(a, p, self._rng) for a, p in zip(actions, pred)])))

if self._pred_format[:2] == 'AX':
A = pred
@@ -335,8 +331,5 @@ def learn(self, context, action, reward, probability, **kwargs) -> None:
raise CobaException("It appears that learner.learn expected kwargs but learner.predict did not provide any.") from ex
raise

def _get_pmf_index(self,pmf):
return self._rng.choice(range(len(pmf)), pmf)

def __str__(self) -> str:
return self.full_name
13 changes: 11 additions & 2 deletions coba/tests/test_evaluators_online.py
@@ -658,8 +658,17 @@ def request(self, context, actions, request):
def test_batched_request_continuous(self):
class TestLearner:
def request(self,context,actions,request):
if isinstance(context,BatchType): raise Exception()
return .5
if isinstance(context,BatchType):
raise Exception()
return 0.5

def predict(self, context, actions):
Contributor Author

Struggling to make this test pass.
The processing thinks the prediction is in AX format and then always fills in None for the probability. I haven't worked with continuous actions before and I'm not quite sure about all the different formats in the SafeLearner.
Any advice, @mrucker?

[Screenshot attached: 2023-11-06, 12:32 PM]

# if isinstance(context,BatchType):
# raise Exception()
return [(2, 0.5, None), (3, 0.5, None)]
# return (2, 0.5)
# return 2, 0.5, None
# return 2

task = OffPolicyEvaluator(learn=False)
learner = TestLearner()
27 changes: 26 additions & 1 deletion coba/tests/test_utilities.py
@@ -2,8 +2,10 @@
import unittest
import unittest.mock

from coba import CobaRandom
from coba.exceptions import CobaExit, sans_tb_sys_except_hook
from coba.utilities import PackageChecker, KeyDefaultDict, coba_exit, peek_first
from coba.utilities import PackageChecker, KeyDefaultDict, coba_exit, peek_first, sample_actions


class coba_exit_Tests(unittest.TestCase):
def test_coba_exit(self):
@@ -122,5 +124,28 @@ def test_simple_peek_n(self):
self.assertEqual(first,[1,2,3])
self.assertEqual(list(items),[1,2,3])

class sample_actions_Tests(unittest.TestCase):

    def test_sampling(self):
        actions = [1,2,3]
        probs = [0,0,1]
        action, prob = sample_actions(actions, probs)
        self.assertEqual(action, 3)
        self.assertEqual(prob, 1)

    def test_statistics(self):
        actions = [1,2,3]
        probs = [0.1, 0.2, 0.7]
        action, prob = zip(*[sample_actions(actions, probs) for _ in range(10_000)])
        self.assertTrue(action.count(3) > action.count(2) > action.count(1))

    def test_custom_rng(self):
        actions = [1,2,3]
        probs = [0,0,1]
        action, prob = sample_actions(actions, probs, CobaRandom(seed=1.23))
        self.assertEqual(action, 3)
        self.assertEqual(prob, 1)

if __name__ == '__main__':
unittest.main()
19 changes: 18 additions & 1 deletion coba/utilities.py
@@ -3,9 +3,12 @@

from itertools import chain, islice
from collections import defaultdict
from typing import TypeVar, Iterable, Tuple, Union, Sequence
from typing import TypeVar, Iterable, Tuple, Union, Sequence, Any, Optional

from coba import CobaRandom
from coba.exceptions import CobaExit
from coba.random import choice


def coba_exit(message:str):
#we ignore warnings before exiting in order to make jupyter's output a little cleaner
@@ -122,3 +125,17 @@ def peek_first(items: Iterable[_T], n:int=1) -> Tuple[Union[_T,Sequence[_T]], It
first = None if not first else first[0] if n==1 else first

return first, items


def sample_actions(
    actions: Sequence[Any],
    probabilities: Sequence[float],
    rng: Optional[CobaRandom] = None,
) -> Tuple[Any, float]:
    """
    Sample a single action weighted by the given probabilities.

    Returns the sampled action along with its probability.
    """
    # Use the provided CobaRandom instance if given, otherwise the module-level choice.
    choice_function = rng.choice if rng else choice
    index = choice_function(range(len(probabilities)), probabilities)

    return actions[index], probabilities[index]
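A brief usage sketch of the new helper; the action values below are purely illustrative, while the imports and the CobaRandom(seed=...) call mirror their use in the tests above:

    from coba import CobaRandom
    from coba.utilities import sample_actions

    # Illustrative usage (the action values are made up for the example).
    actions = ['left', 'right', 'stay']
    probs = [0.2, 0.3, 0.5]

    action, prob = sample_actions(actions, probs)                         # module-level RNG
    action, prob = sample_actions(actions, probs, CobaRandom(seed=1.23))  # reproducible draws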