diff --git a/Documentos/TFG_Machine_Learning/Reinforce_Learning.py b/Documentos/TFG_Machine_Learning/Reinforce_Learning.py
index f73da80..c343663 100644
--- a/Documentos/TFG_Machine_Learning/Reinforce_Learning.py
+++ b/Documentos/TFG_Machine_Learning/Reinforce_Learning.py
@@ -258,7 +258,7 @@ class Swimmer(Agent):
 
         # update coarse-grained state
-        self.update_state()
+        #self.update_state()
 
         self.t = 0
         self.obstacles = obstacles
 
@@ -411,9 +411,9 @@ class Swimmer(Agent):
         action_index = np.argmax(Q[state_index]) # find largest entry in this row of Q (i.e. this state)
         Wc=0.175*self.ni/(.5*self.sigma*self.sigma)
         if action_index == 0: # aumenta 1/8W
-            self.W[2] += 1./8*Wc
+            self.W[2] += .001/8*Wc
         elif action_index == 1: # disminuye 1/8W
-            self.W[2] -= 1./8*Wc
+            self.W[2] -= .001/8*Wc
         else:
             raise Exception ("Action index out of bounds: ", action_index)
         return action_index
@@ -422,9 +422,9 @@ class Swimmer(Agent):
         action_index = np.random.randint(0, 2, 1)
         Wc=0.175*self.ni/(.5*self.sigma*self.sigma)
         if action_index == 0: # aumenta 1/8W
-            self.W[2] += 1./8*Wc
+            self.W[2] += .001/8*Wc
         else: # disminuye 1/8W
-            self.W[2] -= 1./8*Wc
+            self.W[2] -= .001/8*Wc
         return action_index
 
     def periodic_boundaries(self, isxperiodic=True, isyperiodic=True, iszperiodic=True):
@@ -452,7 +452,7 @@ def tgv(x, z):
     w = -np.cos(x)*np.cos(z)
     return ux, uz, w
 
-def training(alpha0,kappa,alphaMAG,beta,gammaYUK,Pe,dt, ni, sigma, Ns=4000, Ne=5000, gamma=0.999, eps0=0.0, n_updates=1000, \
+def training(alpha0,kappa,alphaMAG,beta,gammaYUK,Pe,dt, ni, sigma, Ns=4000, Ne=5000, Naction=100, gamma=0.999, eps0=0.0, n_updates=1000, \
              RIC=False, method="Qlearning", lr_decay=None, omega=0.85, eps_decay=False, Qin=None):
     # n_updates - how often to plot the trajectory undertaken by the particle during the learning process
     # Ne - number of episodes
@@ -548,9 +548,11 @@ def training(alpha0,kappa,alphaMAG,beta,gammaYUK,Pe,dt, ni, sigma, Ns=4000, Ne=5
             old_s = state_lookup_table[smart.my_state]
 
             # given selected action, update the state
-            naive.interaction_with_obstacles(naive.obstacles, kappa,alphaMAG,beta,gammaYUK,Pe,dt)
-            smart.interaction_with_obstacles(smart.obstacles, kappa,alphaMAG,beta,gammaYUK,Pe,dt)
+            for step in range(Naction):
+                naive.interaction_with_obstacles(naive.obstacles, kappa,alphaMAG,beta,gammaYUK,Pe,dt)
+                smart.interaction_with_obstacles(smart.obstacles, kappa,alphaMAG,beta,gammaYUK,Pe,dt)
             smart.update_state() # only need to update smart particle since naive has ka = [0, 1] always
+            print(ep, smart.R, smart.W[2])
 
             # calculate reward based on new state
             naive.calc_reward(stage)
@@ -641,7 +643,7 @@ Q = np.random.rand(4, 2)
 print(Q)
 
-Ns = 10000
+Ns = 100
 
 spinner = Swimmer(Ns, 1, 1)
 
 traj = []
@@ -665,16 +667,17 @@ traj = []
 my_alpha0 = 1.0
 my_eps0 = 1.0
 Ne=20
+naction=100
 stepsupdate = 2
 Q, Σ, smart, naive, hist_R_tot_smart, hist_R_tot_naive, smart_stored_histories, naive_stored_histories, \
     state_action_counter, chosen_actions, avg_Q_hist, initial_coords, theta_history, obstacles \
-    = training(my_alpha0, 2.5, 1, 1., 2.5e-4, 10000, 0.00001, 1., 1.,Ns, Ne, 0.999, 0.0, stepsupdate)
+    = training(my_alpha0, 2.5, 1, 1., 2.5e-4, 10000, 0.00001, 1., 1.,Ns, Ne, naction, 0.999, 0.0, stepsupdate)
 
 #print(smart_stored_histories[1][1][3, :])
 #print(len(smart_stored_histories), smart_stored_histories[0].shape)
 fig, ax= plt.subplots(1,1)
 #ax.plot(np.array(traj[::2]) + L/8., np.array(traj[1::2]) + L/8., '.')
-ax.plot(np.array(obstacles[::2]) + L/8., np.array(obstacles[1::2]) + L/8., '.')
+#ax.plot(np.array(obstacles[::2]) + L/8., np.array(obstacles[1::2]) + L/8., '.')
 
 for i in range(0, stepsupdate, Ne):
     ax.plot(smart_stored_histories[i][1][:, 0], smart_stored_histories[i][1][:, 1], '.', label='episode %d'%i)
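
For orientation, below is a minimal, self-contained sketch of the action/update cycle these hunks set up: each greedy action nudges the spinning rate W[2] by 0.001/8 of the characteristic rate Wc = 0.175*ni/(0.5*sigma^2), and the dynamics are then advanced for Naction sub-steps before the coarse-grained state is refreshed. The helpers step_dynamics() and coarse_state() are hypothetical stand-ins for interaction_with_obstacles() and update_state(); the numerical values follow the diff.

import numpy as np

ni, sigma = 1.0, 1.0
Wc = 0.175 * ni / (0.5 * sigma * sigma)   # characteristic spinning rate, as in choose_action
dW = 0.001 / 8 * Wc                       # per-action increment after this change (was Wc/8)
Naction = 100                             # dynamics sub-steps per learning update

W2 = 0.0                                  # the controlled component W[2]
Q = np.random.rand(4, 2)                  # 4 coarse states x 2 actions, as in the script

def step_dynamics(w2, dt=1e-5):
    # hypothetical stand-in for interaction_with_obstacles(): advance one time step
    return w2

def coarse_state(w2):
    # hypothetical stand-in for update_state(): map W[2] to one of 4 coarse states
    return int(np.clip(np.floor(w2 / Wc + 2.0), 0, 3))

state = coarse_state(W2)
for update in range(10):                  # a few learning updates
    action = np.argmax(Q[state])          # greedy choice over this state's row of Q
    W2 += dW if action == 0 else -dW      # action 0 increases W[2], action 1 decreases it
    for _ in range(Naction):              # Naction integration steps per chosen action
        W2 = step_dynamics(W2)
    state = coarse_state(W2)              # re-coarse-grain only after the sub-steps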