accept-process-output(#<process ycmd-server> 0 100 t)
ycmd--start-server("|\257}\n\362\301\325OW\333&\370I\263\324")
ycmd-open()
ycmd--request("/event_notification" (("event_name" . "FileReadyToParse") ("file_data" ("/home/vlad/Documents/cal/courses-year1/cs294-112/cal-deeprl-hw4/controllers.py" ("contents" . "import tensorflow as tf\nimport numpy as np\nimport time\nfrom utils import get_ac_dim, get_ob_dim, build_mlp\n\n\nclass Controller:\n def __init__(self):\n pass\n\n def get_action(self, state):\n raise NotImplementedError\n\n def fit(self, data):\n pass\n\n def reset(self, nstates):\n pass\n\nclass RandomController(Controller):\n def __init__(self, env):\n super().__init__()\n self.ac_space = env.action_space\n\n def get_action(self, states):\n nstates = len(states)\n return self._sample_n(nstates)\n\n def _sample_n(self, n):\n return np.random.uniform(\n low=self.ac_space.low,\n high=self.ac_space.high,\n size=(n,) + self.ac_space.shape)\n\n\nclass MPCcontroller(Controller):\n def __init__(self,\n env,\n dyn_model,\n horizon=5,\n cost_fn=None,\n num_simulated_paths=10,\n sess=None,\n policy=None):\n super().__init__()\n self.ac_dim = get_ac_dim(env)\n self.ac_space = env.action_space\n self.sess = sess\n self.num_simulated_paths = num_simulated_paths\n\n # compute the rollout in full TF to keep all computation on the GPU\n # a = action dim\n # s = state dim\n # n = batch size = num states to get MPC actions for * simulated rollouts\n # i = number of states in batch for get_action\n self.input_state_ph_is = tf.placeholder(\n tf.float32, [None, get_ob_dim(env)], 'mpc_input_state')\n state_ns = tf.tile(self.input_state_ph_is, (num_simulated_paths, 1))\n # use the specified policy during MPC rollouts\n ac_space = env.action_space\n if policy is None:\n policy = self._create_random_policy(ac_space)\n self.initial_action_na = policy(state_ns, is_initial=True)\n self.input_action_ph_na = tf.placeholder(\n tf.float32, [None, self.ac_dim], 'mpc_input_action')\n def body(t, state_ns, action_na, costs):\n next_state_ns = dyn_model.predict_tf(state_ns, action_na)\n next_costs = cost_fn(state_ns, action_na, next_state_ns, costs)\n next_action_na = policy(next_state_ns, is_initial=False)\n return [t + 1, next_state_ns, next_action_na, next_costs]\n n = tf.shape(state_ns)[0]\n loop_vars = [\n tf.constant(0),\n state_ns,\n self.input_action_ph_na,\n tf.zeros((n,))]\n self.loop = tf.while_loop(lambda t, _, __, ___: t < horizon, body,\n loop_vars, back_prop=False)\n\n @staticmethod\n def _create_random_policy(ac_space):\n def policy(state_ns, **_):\n n = tf.shape(state_ns)[0]\n ac_dim = ac_space.low.shape\n ac_na = tf.random_uniform((n,) + ac_dim)\n ac_na *= (ac_space.high - ac_space.low)\n ac_na += ac_space.low\n return ac_na\n return policy\n\n def get_action(self, states):\n nstates = len(states)\n\n action_na = self.sess.run(self.initial_action_na,\n feed_dict={self.input_state_ph_is: states})\n _, _, _, trajectory_costs_n = self.sess.run(self.loop, feed_dict={\n self.input_state_ph_is: states,\n self.input_action_ph_na: action_na})\n\n # p = num simulated paths, i = nstates\n # note b/c of the way tf.tile works we need to reshape by p then i\n per_state_simulation_costs_ip = trajectory_costs_n.reshape(\n self.num_simulated_paths, nstates).T\n best_ac_ix_i = per_state_simulation_costs_ip.argmin(axis=1)\n action_samples_ipa = np.swapaxes(action_na.reshape(\n self.num_simulated_paths, nstates, self.ac_dim), 0, 1)\n best_ac_ia = action_samples_ipa[np.arange(nstates), best_ac_ix_i, :]\n\n return best_ac_ia\n\nclass BPTT(Controller):\n def __init__(self,\n env,\n dyn_model,\n horizon=None,\n cost_fn=None,\n 
learning_rate=None,\n depth=None,\n width=None,\n batch_size=None,\n epochs=None,\n sess=None):\n super().__init__()\n self.sess = sess\n self.batch_size = batch_size\n self.epochs = epochs\n self.ac_space = env.action_space\n self.ob_dim = get_ob_dim(env)\n self.ac_dim = get_ac_dim(env)\n self.width = width\n self.depth = depth\n\n # rnn used by policy\n self.rnn = tf.contrib.rnn.OutputProjectionWrapper(\n tf.nn.rnn_cell.MultiRNNCell(\n [tf.nn.rnn_cell.GRUCell(width) for _ in range(depth)]),\n self.ac_dim,\n activation=tf.sigmoid)\n \n # a = action dim\n # s = state dim\n # n = batch size\n # h = hidden unit size\n self.initial_rnn_state_list_nh = [\n tf.placeholder(tf.float32, [None, width]) for _ in range(depth)]\n self.input_state_ph_ns = tf.placeholder(\n tf.float32, [None, self.ob_dim])\n self.policy_action_na, self.resulting_rnn_state_nh = self._rnn_policy(\n self.input_state_ph_ns, self.initial_rnn_state_list_nh)\n self.maintained_rnn_state = None\n\n # compute the rollout in full TF to keep all computation on the GPU\n # reuse the policy network for BPTT model-based optimization\n self.bptt_initial_state_ph_ns = tf.placeholder(\n tf.float32, [batch_size, self.ob_dim], \"bptt_input_state\")\n def body(t, state_ns, rnn_state_nh, costs_n):\n action_na, next_rnn_state_nh = self._rnn_policy(\n state_ns, rnn_state_nh)\n next_state_ns = dyn_model.predict_tf(state_ns, action_na)\n next_costs_n = cost_fn(state_ns, action_na, next_state_ns, costs_n)\n return [t + 1, next_state_ns, next_rnn_state_nh, next_costs_n]\n loop_vars = [\n tf.constant(0),\n self.bptt_initial_state_ph_ns,\n self.rnn.zero_state(batch_size, tf.float32),\n tf.zeros((batch_size,))]\n _, _, _, costs_n = tf.while_loop(\n lambda t, _, __, ___: t < horizon, body, loop_vars)\n self.mean_cost = tf.reduce_mean(costs_n)\n policy_vars = self.rnn.trainable_variables\n self.update_op = tf.train.AdamOptimizer(learning_rate).minimize(\n self.mean_cost, var_list=policy_vars)\n\n def fit(self, data):\n all_obs = data.stationary_obs()\n nexamples = len(all_obs)\n nbatches = max(nexamples // self.batch_size, 1)\n batches = np.random.randint(nexamples, size=(\n self.epochs * nbatches, self.batch_size))\n for batch_idx in batches:\n input_states_sample = all_obs[batch_idx]\n self.sess.run(self.update_op, feed_dict={\n self.bptt_initial_state_ph_ns: input_states_sample})\n\n def reset(self, nstates):\n self.maintained_rnn_state = [\n np.zeros((nstates, self.width))\n for _ in range(self.depth)]\n\n def get_action(self, states_ns):\n feed_dict = {\n self.input_state_ph_ns: states_ns}\n for layer_state_ph, layer_state in zip(self.initial_rnn_state_list_nh,\n self.maintained_rnn_state):\n feed_dict[layer_state_ph] = layer_state\n action_na, next_rnn_state_nh = self.sess.run(\n [self.policy_action_na, self.resulting_rnn_state_nh],\n feed_dict=feed_dict)\n self.maintained_rnn_state = next_rnn_state_nh\n return action_na\n\n def _rnn_policy(self, state_ns, rnn_state_nh):\n ac_na, next_rnn_state_nh = self.rnn(state_ns, rnn_state_nh)\n ac_na *= (self.ac_space.high - self.ac_space.low)\n ac_na += self.ac_space.low\n return ac_na, next_rnn_state_nh\n\nclass MPCMF(Controller):\n def __init__(self,\n env,\n dyn_model,\n horizon=None,\n cost_fn=None,\n num_simulated_paths=None,\n learning_rate=None,\n depth=None,\n width=None,\n batch_size=None,\n epochs=None,\n sess=None):\n super().__init__()\n self.sess = sess\n self.batch_size = batch_size\n self.epochs = epochs\n self.ob_dim = get_ob_dim(env)\n self.ac_dim = get_ac_dim(env)\n self.width = width\n 
self.depth = depth\n self.ac_space = env.action_space\n\n # create placeholder for training an MPC learner\n # a = action dim\n # s = state dim\n # n = batch size\n self.input_state_ph_ns = tf.placeholder(\n tf.float32, [None, self.ob_dim])\n self.policy_action_na = self._policy(\n self.input_state_ph_ns, is_initial=False, reuse=None)\n self.expert_action_ph_na = tf.placeholder(\n tf.float32, [None, self.ac_dim])\n mse = tf.losses.mean_squared_error(\n self.expert_action_ph_na,\n self.policy_action_na)\n \n # use the learner value to expand the MPC (first action is random)\n self.mpc = MPCcontroller(\n env, dyn_model, horizon, cost_fn, num_simulated_paths, sess,\n self._policy)\n \n self.update_op = tf.train.AdamOptimizer(learning_rate).minimize(mse)\n\n def _policy(self, state_ns, is_initial=True, reuse=True):\n def exploit_policy(state_ns):\n ac_na = build_mlp(\n state_ns, scope='mf_policy_mean',\n n_layers=self.depth, size=self.width, activation=tf.nn.relu,\n output_activation=tf.sigmoid, reuse=reuse)\n ac_na *= (self.ac_space.high - self.ac_space.low)\n ac_na += self.ac_space.low\n return ac_na\n random_policy = MPCcontroller._create_random_policy(self.ac_space)\n if is_initial:\n return random_policy(state_ns)\n else:\n return exploit_policy(state_ns)\n\n def fit(self, data):\n all_obs = data.stationary_obs()\n all_acs = data.stationary_acs()\n nexamples = len(all_obs)\n assert nexamples == len(all_acs), (nexamples, len(all_acs))\n per_epoch = max(nexamples // self.batch_size, 1)\n batches = np.random.randint(nexamples, size=(\n self.epochs * per_epoch, self.batch_size))\n for i, batch_idx in enumerate(batches, 1):\n input_states_sample = all_obs[batch_idx]\n label_actions_sample = all_acs[batch_idx]\n self.sess.run(self.update_op, feed_dict={\n self.input_state_ph_ns: input_states_sample,\n self.expert_action_ph_na: label_actions_sample})\n\n def get_action(self, states_ns):\n return self.mpc.get_action(states_ns)\n") ("filetypes" "python"))) ("filepath" . "/home/vlad/Documents/cal/courses-year1/cs294-112/cal-deeprl-hw4/controllers.py") ("line_num" . 1) ("column_num" . 1)) :parser json-read) | |
ycmd-notify-file-ready-to-parse()
ycmd--conditional-parse(mode-enabled)
ycmd-mode()
ycmd--maybe-enable-mode()
global-ycmd-mode-enable-in-buffers()
run-hooks(change-major-mode-after-body-hook prog-mode-hook python-mode-hook)
apply(run-hooks (change-major-mode-after-body-hook prog-mode-hook python-mode-hook))
run-mode-hooks(python-mode-hook)
python-mode()
set-auto-mode-0(python-mode nil)
set-auto-mode()
#[0 "\300 \207" [set-auto-mode] 1 "\n\n(fn)"]()
funcall(#[0 "\300 \207" [set-auto-mode] 1 "\n\n(fn)"])
normal-mode(t)
after-find-file(nil t)
find-file-noselect-1(#<buffer controllers.py> "~/Documents/cal/courses-year1/cs294-112/cal-deeprl-hw4/controllers.py" nil nil "~/Documents/cal/courses-year1/cs294-112/cal-deeprl-hw4/controllers.py" (5111978 64769))
find-file-noselect("/home/vlad/Documents/cal/courses-year1/cs294-112/cal-deeprl-hw4/controllers.py" nil nil)
ido-file-internal(raise-frame)
ido-find-file()
call-interactively(ido-find-file nil nil)
command-execute(ido-find-file)
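
A note on the "reshape by p then i" comment in MPCcontroller.get_action above: because tf.tile stacks p whole copies of the i-row state block, flat rollout n corresponds to (path, state) = (n // i, n % i), so the flat cost vector must be reshaped to (p, i) and transposed before taking the per-state argmin. A minimal NumPy sketch of that indexing (the sizes i = 2, p = 3 and the costs here are made up for illustration):

import numpy as np

i, p = 2, 3                              # i input states, p simulated paths each
states = np.array([[0.0], [1.0]])        # (i, s) with a 1-dim observation

# np.tile(states, (p, 1)) stacks p whole copies of the i-row block,
# so flat row n corresponds to (path, state) = (n // i, n % i).
tiled = np.tile(states, (p, 1))          # shape (p * i, s)
assert (tiled[1 * i + 0] == states[0]).all()   # path 1, state 0

# Fake rollout costs laid out in that same flat order.
costs_n = np.arange(p * i, dtype=float)  # cost of rollout n is just n

# Reshape by p then i (matching the tiling), then transpose to (i, p):
# entry [state, path] is the cost of that state's path-th rollout.
per_state_costs_ip = costs_n.reshape(p, i).T
assert per_state_costs_ip[0, 1] == costs_n[1 * i + 0]

# argmin over axis=1 then picks the cheapest simulated path per input state.
best_path_per_state = per_state_costs_ip.argmin(axis=1)
print(best_path_per_state)               # -> [0 0]: path 0 is cheapest here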