#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 6 19:02:32 2024

@author: marta
"""

import numpy as np
import math
#import plotly.graph_objects as go
#from tqdm.notebook import tqdm
#import plotly.express as px
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300
import matplotlib.pyplot as plt
#import seaborn as sns
import os
#from wand.image import Image as WImage
# sns.set(palette="husl",font_scale=1)
# %config InlineBackend.figure_format = 'retina'
import copy
np.random.seed(69732)
#%load_ext line_profiler


# Define constants

#L = 2*np.pi # periodic domain size
L = 50
# define boundaries of simulation box
x0 = 0
x1 = L
z0 = 0
z1 = L

# define reinforcement learning problem
N_states = 4   # number of states - one for each (distance, spin-rate) combination
N_actions = 2  # number of actions - increase or decrease the spin rate W_z

# numerical parameters
#dt = 0.00001 # timestep size


# Define useful data structures
# Define a dictionary of the possible states and their assigned indices

distance_states = ["ri", "rni"]   # "ri" means rij < rct (near an obstacle), "rni" means rij > rct
frecuency_states = ["wo", "wh"]   # "wo" means w < wc (low spin rate), "wh" means w > wc (high spin rate)
product_states = [(x, y) for x in distance_states for y in frecuency_states]  # all possible states
state_lookup_table = {product_states[i]: i for i in range(len(product_states))}  # returns index of given state
# print(product_states) # to view mapping

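# For reference, the mapping implied by the definitions above is:
#   ("ri",  "wo") -> 0,  ("ri",  "wh") -> 1,
#   ("rni", "wo") -> 2,  ("rni", "wh") -> 3
# These indices label the rows of the Q matrix; the two columns are the actions.
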
# Define an agent class for reinforcement learning

class Agent:
    def __init__(self, Ns):
        self.r = np.zeros(Ns)  # reward for each stage
        self.t = 0             # time

    # calculate reward given from entering a new state after a selected action is undertaken
    def calc_reward(self):
        # enforce implementation by subclass
        if self.__class__ == Agent:
            raise NotImplementedError

    def update_state(self):
        # enforce implementation by subclass
        if self.__class__ == Agent:
            raise NotImplementedError

    def take_random_action(self):
        # enforce implementation by subclass
        if self.__class__ == Agent:
            raise NotImplementedError

    def take_greedy_action(self, Q):
        # enforce implementation by subclass
        if self.__class__ == Agent:
            raise NotImplementedError

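# Note: the checks above simply guard against calling these methods on Agent itself.
# A more idiomatic alternative (a sketch, not used in this script) would be the
# standard-library abc module:
#   from abc import ABC, abstractmethod
#   class Agent(ABC):
#       @abstractmethod
#       def calc_reward(self): ...
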
# Define swimmer class derived from agent

class Swimmer(Agent):
    def __init__(self, Ns, ni, sigma):
        # call init for superclass
        super().__init__(Ns)

        self.ni = ni
        self.sigma = sigma

        # obstacles
        self.obstacles = self.generate_obstacles()

        # initial condition for the swimmer
        self.X = np.array([np.random.uniform(-.5*L, .5*L), np.random.uniform(-.5*L, .5*L), 0])
        self.init_pos = np.array([0., 0., 0.])
        valid_initial_position = False
        while not valid_initial_position:
            self.X = np.array([np.random.uniform(-.5*L, .5*L), np.random.uniform(-.5*L, .5*L), 0])
            valid_initial_position = True
            # check whether the initial position falls inside an obstacle
            for i in range(len(self.obstacles)//2):
                obstacle_position = np.array([self.obstacles[2*i], self.obstacles[2*i+1], 0])
                if np.linalg.norm(self.X - obstacle_position) < 0.8*self.sigma:
                    valid_initial_position = False
                    break
        self.init_pos = self.X.copy()
        # absolute position. -inf. <= x_total < inf. and -inf. <= z_total < inf.
        self.X_total = self.X.copy()  # copy so that X_total is not wrapped together with X

        # particle orientation
        self.theta = np.random.uniform(0, 2*np.pi)  # polar angle theta in the x-z plane
        self.p = np.array([np.cos(self.theta), np.sin(self.theta)])  # p = [px, pz]^T

        # translational and rotational velocity
        self.U = np.zeros(3, float)
        self.W = np.array([0., 0., 1.])  # angular velocity (spin about the z axis)

        # distance between the swimmer and the nearest obstacle
        self.R = np.random.uniform(0, 2.5, 1)

        # history of local and global position. Only store information for this episode.
        self.history_X = [self.X]
        self.history_X_total = [self.X_total]

        # update coarse-grained state
        self.update_state()

    def generate_obstacles(self):
        obstacles = []  # flat list [x0, y0, x1, y1, ...]; the number of obstacles is ncells*ncells = 20*20
        cell_spacing = L/20.
        ncells = 20

        for i in range(ncells):
            for j in range(ncells):
                obstacle_x = i*cell_spacing - .5*L + .5*cell_spacing
                obstacle_y = j*cell_spacing - .5*L + .5*cell_spacing
                obstacles.append(obstacle_x)
                obstacles.append(obstacle_y)

        return obstacles

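    # The loop above places obstacle centres on a square lattice of spacing L/20, shifted
    # by half a cell so the lattice is centred in the periodic box [-L/2, L/2) x [-L/2, L/2);
    # coordinates are stored as a flat [x, y, x, y, ...] list.
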
    def interaction_with_obstacles_numpy(self, obstacles, kappa, alpha, beta, gamma, Pe, Restar, dt):
        F = np.array([0., 0., 0.])

        # midpoint integration
        nobs = len(obstacles)//2
        mid_obs_array = np.array(obstacles).reshape(nobs, -1)           # nobs x 2 array
        obs_array = np.hstack((mid_obs_array, np.zeros((nobs, 1))))     # add z=0 column
        del mid_obs_array
        r_nopbc = np.atleast_2d(self.X).repeat(nobs, axis=0) - obs_array
        r = r_nopbc + L*np.floor(-r_nopbc/L + .5)  # minimum-image separation in the periodic box
        r[:, 2] = 0.
        rnorm = np.linalg.norm(r, axis=1)
        Re = (.5*self.sigma)**2*np.linalg.norm(self.W)/self.ni
        S = np.reciprocal(1+np.exp(-kappa*(Re*Re*Re*np.power(rnorm, -3.)-Restar)))
        F1 = alpha*np.sum(np.multiply(Re*Re*Re*np.power(rnorm, -3.)*rnorm, S))*np.cross(self.U, self.W)
        F2 = beta*np.sum(np.multiply(np.cross(np.atleast_2d(self.W).repeat(nobs, axis=0), r), np.power(rnorm, -3.)[:, None]), axis=0)
        F_attr = gamma*np.sum(((np.exp(-rnorm/kappa)*np.power(rnorm, -2))*(kappa+rnorm))[:, None]*r, axis=0)
        F = F1+F2+F_attr

        # xi is a two-component vector of standard-normal random numbers; it models the thermal (stochastic) noise
        xi = np.random.normal(0, 1, size=2)
        dr_therm_1 = np.sqrt(2*self.sigma**2*(.5*dt)/Pe)*xi
        xi = np.random.normal(0, 1, size=2)
        dr_therm_2 = np.sqrt(2*self.sigma**2*(.5*dt)/Pe)*xi

        dr_mid = F[:-1]*.5*dt + dr_therm_1
        self.U = np.array([2*dr_mid[0]/dt, 2*dr_mid[1]/dt, 0])

        r_nopbc = np.atleast_2d(self.X + np.append(dr_mid, np.zeros(1))).repeat(nobs, axis=0) - obs_array
        r = r_nopbc + L*np.floor(-r_nopbc/L + .5)  # minimum image again (same convention as above)
        r[:, 2] = 0.
        rnorm = np.linalg.norm(r, axis=1)
        Re = (.5*self.sigma)**2*np.linalg.norm(self.W)/self.ni
        S = np.reciprocal(1+np.exp(-kappa*(Re*Re*Re*np.power(rnorm, -3.)-Restar)))
        F1 = alpha*np.sum(np.multiply(Re*Re*Re*np.power(rnorm, -3.)*rnorm, S))*np.cross(self.U, self.W)
        F2 = beta*np.sum(np.multiply(np.cross(np.atleast_2d(self.W).repeat(nobs, axis=0), r), np.power(rnorm, -3.)[:, None]), axis=0)
        F_attr = gamma*np.sum(((np.exp(-rnorm/kappa)*np.power(rnorm, -2))*(kappa+rnorm))[:, None]*r, axis=0)
        F = F1+F2+F_attr

        dr = F[:-1]*dt + dr_therm_1 + dr_therm_2
        # update the spinner position
        self.X[:-1] += dr
        self.X_total[:-1] += dr
        self.U = np.array([dr[0]/dt, dr[1]/dt, 0])

        self.R = np.amin(rnorm)
        # make sure the spinner stays inside the periodic box
        self.periodic_boundaries()

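    # Integration scheme used above, as written: a midpoint-style step. The deterministic force F
    # is first evaluated at the current position to build a half step dr_mid = F*dt/2 + dr_therm_1;
    # F is then re-evaluated at X + dr_mid, and the full step dr = F*dt + dr_therm_1 + dr_therm_2
    # is applied from the original position. Obstacle separations use the minimum-image convention
    # r -> r - L*round(r/L) so interactions respect the periodic box, and self.R keeps the distance
    # to the nearest obstacle for the coarse-grained RL state.
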
    def reinitialize(self, obstacles):
        # absolute position. -inf. <= x_total < inf. and -inf. <= z_total < inf.
        self.X = self.init_pos.copy()
        self.X_total = self.init_pos.copy()

        # particle orientation
        self.theta = np.random.uniform(0, 2*np.pi)  # polar angle theta in the x-z plane
        self.p = np.array([np.cos(self.theta), np.sin(self.theta)])  # p = [px, pz]^T

        # translational and rotational velocity
        self.U = np.zeros(3, float)
        self.W = np.array([0., 0., 1.])  # angular velocity (spin about the z axis)

        # distance between the swimmer and the nearest obstacle
        self.R = np.random.uniform(0, 2.5, 1)

        # history of local and global position. Only store information for this episode.
        self.history_X.clear()
        self.history_X_total.clear()
        self.history_X.append(self.X)
        self.history_X_total.append(self.X_total)

        # update coarse-grained state
        #self.update_state()

        self.t = 0
        self.obstacles = obstacles

    def calc_reward(self, n):
        first_r = self.history_X_total[-1][1] - self.history_X_total[-2][1]
        if first_r >= .5*L:
            print('low PBC')
            self.r[n] = first_r - 1.*L
        elif first_r <= -.5*L:
            print('high PBC')
            self.r[n] = first_r + 1.*L
        else:
            self.r[n] = first_r

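    # The reward for stage n is the displacement along the second coordinate over the last
    # action interval; apparent jumps larger than L/2 in magnitude are folded back by +-L so
    # that crossing the periodic boundary is not counted as a spurious large reward.
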
    def update_state(self):

        #self.distance_obstacles()
        # z-component of the angular velocity
        W_z = self.W[2]
        if W_z <= 0.175*self.ni/(.5*self.sigma*self.sigma):
            W_state = "wo"
        elif W_z > 0.175*self.ni/(.5*self.sigma*self.sigma):
            W_state = "wh"
        else:
            raise Exception("Invalid value of w detected: ", W_z)

        if self.R <= 1.25:
            R_state = "ri"
        elif self.R > 1.25:
            R_state = "rni"
        else:
            raise Exception("Invalid value of r detected: ", self.R)

        self.my_state = (R_state, W_state)

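    # Coarse-graining thresholds: the spin threshold 0.175*ni/(0.5*sigma^2) is the same critical
    # rate Wc used when the actions adjust W_z, and for the parameters used below (L = 50, a
    # 20x20 lattice) the distance threshold 1.25 equals half the obstacle spacing L/20 = 2.5,
    # so "ri" means the spinner is closer to an obstacle centre than to the midpoint between obstacles.
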
    def take_greedy_action(self, Q):
        state_index = state_lookup_table[self.my_state]
        action_index = np.argmax(Q[state_index])  # find largest entry in this row of Q (i.e. this state)
        Wc = 0.175*self.ni/(.5*self.sigma*self.sigma)
        if action_index == 0:    # increase W_z by (0.01/8)*Wc
            self.W[2] += .01/8*Wc
        elif action_index == 1:  # decrease W_z by (0.01/8)*Wc
            self.W[2] -= .01/8*Wc
        else:
            raise Exception("Action index out of bounds: ", action_index)
        return action_index

    def take_random_action(self):
        action_index = np.random.randint(0, 2)  # scalar draw: 0 or 1
        Wc = 0.175*self.ni/(.5*self.sigma*self.sigma)
        if action_index == 0:  # increase W_z by (0.01/8)*Wc
            self.W[2] += .01/8*Wc
        else:                  # decrease W_z by (0.01/8)*Wc
            self.W[2] -= .01/8*Wc
        return action_index

    def periodic_boundaries(self, isxperiodic=True, isyperiodic=True, iszperiodic=True):

        offset = [math.floor(-self.X[0] * 1./L + 0.5),
                  math.floor(-self.X[1] * 1./L + 0.5),
                  0]

        if isxperiodic:
            self.X[0] += offset[0] * L
        if isyperiodic:
            self.X[1] += offset[1] * L
        if iszperiodic:
            self.X[2] += offset[2] * L


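# Note on the Swimmer geometry: X is the position wrapped back into the periodic box by
# periodic_boundaries() (x and y only; z stays 0), while X_total accumulates the unwrapped
# displacement used for the reward. W is the spin vector whose z-component the RL actions adjust.
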
def training(alpha0, kappa, alphaMAG, beta, gammaYUK, Pe, dt, ni, sigma, Ns=4000, Ne=5000, Naction=100, gamma=0.999, eps0=0.0, n_updates=1000,
             RIC=False, method="Qlearning", lr_decay=None, omega=0.85, eps_decay=True, Qin=None):
    # n_updates - how often to store/plot the trajectory undertaken by the particle during the learning process
    # Ne - number of episodes
    # Ns - number of steps (stages) in an episode
    # alpha0 - learning rate (or starting learning rate when employing LR decay)
    # gamma - discount factor, i.e. how much we weigh future relative to present rewards. Close to 0 = myopic view.
    # eps0 - fraction of the time we allow for exploration in selecting the next action. 0 = always greedy.
    # (D0/Dr, the translational and rotational diffusivities, are not parameters of this version; the noise strength is set by Pe)
    # RIC - Reset of Initial Conditions. First time a state-action pair is encountered, set Q[s,a] = reward
    # method - choose from Q-learning, Double Q-learning, or Expected SARSA
    # lr_decay - whether or not to use learning rate decay. Options are None, or polynomial (lr = 1/#(s,a)**omega)
    # omega - exponent used in lr_decay: lr = 1/#(s,a)**omega
    # eps_decay - whether or not to decay epsilon: eff_eps = eps0/k**omega for the k-th step
    # Qin - initial Q matrix. Useful for testing performance after an extensive exploration phase.

    # if using the expected SARSA method, turn on epsilon decay since eps = 0 is simply Q-learning anyway
    if method == "expSARSA":
        eps_decay = True
        if eps0 == 0: eps0 = 1

    # Total reward for each episode
    hist_R_tot_smart = np.zeros(Ne)
    hist_R_tot_naive = np.zeros(Ne)

    # learning gain per episode
    Σ = np.zeros(Ne)

    smart_stored_histories = []  # store position = f(t) every so often for an episode (smart particles)
    naive_stored_histories = []  # store position = f(t) every so often for an episode (naive particles)

    # number of times each state-action pair has been explored
    state_action_counter = np.zeros((N_states, N_actions))

    # initialize a naive and a smart spinner
    naive = Swimmer(Ns, ni, sigma)
    smart = Swimmer(Ns, ni, sigma)
    naive.obstacles = smart.obstacles
    obstacles = naive.obstacles.copy()
    init_pos = copy.deepcopy(smart.X)
    print(init_pos.base is None)  # debug check that init_pos is an independent copy
    # initialize Q matrix to a large value
    if method == "doubleQ":
        Q1 = L*Ns*np.ones((N_states, N_actions))
        Q2 = L*Ns*np.ones((N_states, N_actions))
    else:
        Q = L*Ns*np.ones((N_states, N_actions))  # 4 states, 2 possible actions. Each column is an action on W_z.

    if Qin is not None: Q = Qin

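    # Note on the initialization: L*Ns upper-bounds the total displacement reward attainable in
    # an episode (each stage's reward is at most about L/2 after periodic folding), so the Q
    # tables start out optimistic and every state-action pair gets tried before the values settle.
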
    # store average Q for each episode to track convergence
    avg_Q_history = np.zeros((Ne, N_states, N_actions))

    # store initial position and orientation for each episode
    initial_coords = np.empty([Ne, 3], float)
    for k in range(Ne):
        initial_coords[k, :] = smart.X

    # iterate over episodes
    k = 0
    for ep in range(Ne):

        # assign the shared initial position and reinitialize both particles
        # (the orientation is re-randomized inside reinitialize)
        #print(init_pos)
        #print(smart.X)
        smart.init_pos = init_pos.copy()
        naive.init_pos = init_pos.copy()
        smart.reinitialize(obstacles)
        naive.reinitialize(obstacles)
        naive = copy.deepcopy(smart)  # have naive and smart share initial conditions for visualization purposes

        # store initialization
        initial_coords[ep, 0:3] = smart.X

        # save selected actions and particle orientation for the last episode
        if ep == Ne - 1:
            chosen_actions = np.zeros(Ns)
            theta_history = np.zeros(Ns)

        # iterate over stages within an episode
        # store this episode's trajectory in fixed arrays so later episodes do not overwrite it
        smart_history_X_total = np.zeros((Ns, 3), float)
        naive_history_X_total = np.zeros((Ns, 3), float)
        for stage in range(Ns):

            # select an action eps-greedily. Note naive never changes its action/strategy (it keeps its initial spin rate)
            Qinput = Q1 + Q2 if method == "doubleQ" else Q
            k = k + 1  # k-th update

            eff_eps = eps0/k**omega if eps_decay else eps0  # decrease the amount of exploration as time proceeds
            if np.random.uniform(0, 1) < eff_eps:
                action = smart.take_random_action()
            else:
                action = smart.take_greedy_action(Qinput)

            # record action and orientation on the last episode
            if ep == Ne - 1:
                chosen_actions[stage] = action
                theta_history[stage] = smart.theta

            # record index of the prior state
            old_s = state_lookup_table[smart.my_state]

            # given the selected action, integrate the dynamics for Naction timesteps
            for step in range(Naction):
                naive.interaction_with_obstacles_numpy(naive.obstacles, kappa, alphaMAG, beta, gammaYUK, Pe, .094, dt)
                smart.interaction_with_obstacles_numpy(smart.obstacles, kappa, alphaMAG, beta, gammaYUK, Pe, .094, dt)

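            # Each RL stage holds the chosen action fixed while the dynamics are integrated
            # for Naction timesteps, so one stage spans Naction*dt units of simulation time.
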
            #print(init_pos)
            smart.history_X_total.append(smart.X_total.copy())
            #print('works', smart.history_X_total[-1])
            smart.history_X.append(smart.X.copy())
            naive.history_X_total.append(naive.X_total.copy())
            naive.history_X.append(naive.X.copy())
            smart_history_X_total[stage, :] = smart.X_total.copy()
            naive_history_X_total[stage, :] = naive.X_total.copy()
            #print(smart.history_X_total)
            smart.update_state()  # only need to update the smart particle since naive never changes its action
            #print(ep, smart.R, smart.W[2])

            # calculate reward based on the new state
            naive.calc_reward(stage)
            smart.calc_reward(stage)

            new_s = state_lookup_table[smart.my_state]
            state_action_counter[new_s, action] += 1

            # employ learning rate decay if applicable
            alpha = alpha0/(1+state_action_counter[old_s, action])**omega if lr_decay else alpha0

            # update Q matrix
            if method == "doubleQ":
                if np.random.uniform(0, 1) < 0.5:  # update Q1
                    if Q1[old_s, action] == L*Ns and RIC == True:  # apply Reset of Initial Conditions (RIC)
                        Q1[old_s, action] = smart.r[stage]
                    else:
                        Q1[old_s, action] = Q1[old_s, action] + alpha*(smart.r[stage] +
                                            gamma*np.max(Q2[new_s, :]) - Q1[old_s, action])
                else:  # update Q2
                    if Q2[old_s, action] == L*Ns and RIC == True:
                        Q2[old_s, action] = smart.r[stage]
                    else:
                        Q2[old_s, action] = Q2[old_s, action] + alpha*(smart.r[stage] +
                                            gamma*np.max(Q1[new_s, :]) - Q2[old_s, action])
            elif method == "expSARSA":
                # calculate V, the expected Q value for the next state-action pair under the eps-greedy policy
                V = 0
                greedy_action = np.argmax(Q[new_s])  # would-be greedy action for the new state
                for new_action in range(N_actions):
                    pi = (1 - eff_eps) + eff_eps/N_actions if new_action == greedy_action else eff_eps/N_actions
                    V = V + pi*Q[new_s, new_action]

                if Q[old_s, action] == L*Ns and RIC == True:
                    Q[old_s, action] = smart.r[stage]
                else:
                    Q[old_s, action] = Q[old_s, action] + alpha*(smart.r[stage] + gamma*V - Q[old_s, action])
            else:
                if Q[old_s, action] == L*Ns and RIC == True:
                    Q[old_s, action] = smart.r[stage]
                else:
                    Q[old_s, action] = Q[old_s, action] + alpha*(smart.r[stage] +
                                       gamma*np.max(Q[new_s, :]) - Q[old_s, action])

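            # The updates above implement the standard tabular TD targets:
            #   Q-learning:      Q[s,a] <- Q[s,a] + alpha*(r + gamma*max_a' Q[s',a'] - Q[s,a])
            #   double Q:        one of Q1/Q2 is chosen at random and updated towards r + gamma*max_a' Q_other[s',a']
            #   expected SARSA:  Q[s,a] <- Q[s,a] + alpha*(r + gamma*E_pi[Q[s',·]] - Q[s,a])
            # with the optional RIC rule replacing the optimistic initial value by the first
            # observed reward for that state-action pair.
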
            #print('test', smart.history_X_total[-1])
            # accumulate Q over the stages of this episode to track convergence
            avg_Q_history[ep] = avg_Q_history[ep] + Q1 + Q2 if method == "doubleQ" else avg_Q_history[ep] + Q

        avg_Q_history[ep] = avg_Q_history[ep]/Ns

        # calculate Rtot for this episode
        R_tot_naive = np.sum(naive.r)
        R_tot_smart = np.sum(smart.r)
        print('Episode %d, reward: %.5f' % (ep, R_tot_smart))

        if R_tot_naive < .0000001:
            R_tot_naive = .0000001  # avoid division by zero in the learning gain
        # calculate learning gain for this episode
        Σ[ep] = R_tot_smart/R_tot_naive - 1
        hist_R_tot_smart[ep] = R_tot_smart
        hist_R_tot_naive[ep] = R_tot_naive

        # store the trajectory every so often
        if ep % n_updates == 0 or ep == Ne-1:
            smart_stored_histories.append((ep, smart_history_X_total))
            naive_stored_histories.append((ep, naive_history_X_total))

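        # Per-episode summary quantities stored above: hist_R_tot_* hold the total rewards and
        # Σ[ep] is the learning gain, i.e. how much the smart particle outperformed the naive
        # baseline (Σ > 0 means the learned policy did better this episode).
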
        # save the learned policy at the end of training
        if ep == Ne-1:
            filename = "Policies/Q_alpha_" + str(alpha).replace(".", "d") + "_Ns_" + str(Ns) + "_Ne_" + str(Ne) + \
                       "_sigma_" + str(sigma).replace(".", "d") + "_Pe_" + str(Pe).replace(".", "d") + "_eps_" \
                       + str(eff_eps).replace(".", "d") + "_epsdecay_" + str(eps_decay)
            if lr_decay: filename = filename + "_omega_" + str(omega)
            if method == "doubleQ": filename = filename + "_" + str(method)
            if RIC: filename = filename + "_RIC_" + str(RIC)
            Qout = Q1 + Q2 if method == "doubleQ" else Q
            os.makedirs("Policies", exist_ok=True)  # make sure the output directory exists
            np.save(filename, Qout)

    return Qout, Σ, smart, naive, hist_R_tot_smart, hist_R_tot_naive, smart_stored_histories, naive_stored_histories, \
           state_action_counter, chosen_actions, avg_Q_history, initial_coords, theta_history, obstacles

# Plot

# Number of steps per episode
Ns = 40
# Number of episodes
Ne = 10


traj = []

my_alpha0 = 1.0
my_eps0 = 1.0
naction = 100
stepsupdate = 75
kappa = 2.5
alphaMAG = 1
beta = 1.
gammaYUK = 2.5e-4
Pe = 10000
dt = 0.0002
ni = 1.
sigma = 1.
Q, Σ, smart, naive, hist_R_tot_smart, hist_R_tot_naive, smart_stored_histories, naive_stored_histories, \
    state_action_counter, chosen_actions, avg_Q_hist, initial_coords, theta_history, obstacles \
    = training(my_alpha0, kappa, alphaMAG, beta, gammaYUK, Pe, dt, ni, sigma, Ns, Ne, naction, 0.999, 0.5, stepsupdate)

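# Note: the call above passes 0.5 as eps0 and stepsupdate as n_updates positionally;
# my_eps0 is defined but not used in this driver block.
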
#print(smart_stored_histories[1][1][:, 0])
#print(smart_stored_histories[0][1].shape)
fig, ax = plt.subplots(1, 1)
ax.plot(np.array(traj[::2]), np.array(traj[1::2]), '.')
ax.plot(np.array(obstacles[::2]), np.array(obstacles[1::2]), '.')

# plot every stored smart trajectory, plus the naive trajectory from the final episode
for i in range(len(smart_stored_histories)):
    ep_stored = smart_stored_histories[i][0]
    ax.plot(smart_stored_histories[i][1][:, 0], smart_stored_histories[i][1][:, 1], '-', label='episode %d' % ep_stored, alpha=.7)
    if ep_stored == Ne-1:
        ax.plot(naive_stored_histories[i][1][:, 0], naive_stored_histories[i][1][:, 1], '-', label='naive spinner')


ax.set_aspect('equal')
ax.legend()
#plt.show()
fig.savefig('trajectories.png')