Clean code

main
marta 8 months ago
parent 59e2b0291c
commit 0c3bc404f0
      Documentos/TFG_Machine_Learning/Reinforce_learning_actualizado.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 6 19:02:32 2024
@author: marta
"""
import numpy as np
import math
#import plotly.graph_objects as go
#from tqdm.notebook import tqdm
#import plotly.express as px
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300
import matplotlib.pyplot as plt
#import seaborn as sns
import os
#from wand.image import Image as WImage
# sns.set(palette="husl",font_scale=1)
# %config InlineBackend.figure_format = 'retina'
import copy
np.random.seed(69732)
#%load_ext line_profiler
#Define constants
#L = 2*np.pi # periodic domain size
L = 50 # size of the periodic simulation box
# define boundaries of simulation box
x0 = 0
x1 = L
z0 = 0
z1 = L
# define reinforcement learning problem
N_states = 4 # number of states - one for each (distance, spin) combination defined below
N_actions = 2 # number of actions - increase or decrease the spin W_z
# numerical parameters
#dt = 0.00001 # timestep size
#Define useful data structures
#Define a dictionary of the possible states and their assigned indices
distance_states = ["ri", "rni"] # "ri": r_ij < r_ct (close to an obstacle); "rni": r_ij > r_ct
frecuency_states = ["wo", "wh"] # "wo": w < w_c (low spin); "wh": w > w_c (high spin)
product_states = [(x,y) for x in distance_states for y in frecuency_states] # all possible states
state_lookup_table = {product_states[i]:i for i in range(len(product_states))} # returns index of given state
# print(product_states) # to view mapping
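# With the definitions above, the state space has four states with the following indices:
#   ("ri", "wo") -> 0,  ("ri", "wh") -> 1,  ("rni", "wo") -> 2,  ("rni", "wh") -> 3
# e.g. state_lookup_table[("rni", "wo")] returns 2.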
#Define an agent class for reinforcement learning
class Agent:
    def __init__(self, Ns):
        self.r = np.zeros(Ns) # reward for each stage
        self.t = 0 # time
    # calculate reward given from entering a new state after a selected action is undertaken
    def calc_reward(self):
        # enforce implementation by subclass (compare against the base class)
        if self.__class__ == Agent:
            raise NotImplementedError
    def update_state(self):
        # enforce implementation by subclass
        if self.__class__ == Agent:
            raise NotImplementedError
    def take_random_action(self):
        # enforce implementation by subclass
        if self.__class__ == Agent:
            raise NotImplementedError
    def take_greedy_action(self, Q):
        # enforce implementation by subclass
        if self.__class__ == Agent:
            raise NotImplementedError
#Define swimmer class derived from agent
class Swimmer(Agent):
def __init__(self, Ns, ni, sigma):
# call init for superclass
super().__init__(Ns)
self.ni = ni
self.sigma = sigma
#obstáculos
self.obstacles= self.generate_obstacles()
#Condición inicial para el swimmer
self.X = np.array([np.random.uniform(-.5*L, .5*L), np.random.uniform(-.5*L, .5*L), 0])
self.init_pos = np.array([0., 0., 0.])
valid_initial_position = False
while not valid_initial_position:
self.X = np.array([np.random.uniform(-.5*L, .5*L), np.random.uniform(-.5*L, .5*L), 0])
valid_initial_position = True
# Comprobamos si esta inicialmente esta dentro de un obstáculo
for i in range(len(self.obstacles)//2):
obstacle_position = np.array([self.obstacles[2*i], self.obstacles[2*i+1], 0])
if np.linalg.norm(self.X - obstacle_position) < 0.8*self.sigma:
valid_initial_position = False
break
self.init_pos = self.X
# absolute position. -inf. <= x_total < inf. and -inf. <= z_total < inf.
self.X_total = self.X
# particle orientation
self.theta = np.random.uniform(0, 2*np.pi) # polar angle theta in the x-z plane
self.p = np.array([np.cos(self.theta), np.sin(self.theta)]) # p = [px, pz]^T
# translational and rotational velocity
self.U = np.zeros(3, float)
self.W = np.array([0., 0., 1.]) #Velocidad angular aleatoria
#distancia entre el swimmer y el obstáculo
self.R=np.random.uniform(0, 2.5, 1)
# history of local and global position. Only store information for this episode.
self.history_X = [self.X]
self.history_X_total = [self.X_total]
# update coarse-grained state
self.update_state()
    def generate_obstacles(self):
        obstacles = [] # flat list [x0, y0, x1, y1, ...] on a square lattice (ncells x ncells = 400 obstacles)
        cell_spacing = L/20.
        ncells = 20
        for i in range(ncells):
            for j in range(ncells):
                obstacle_x = i*cell_spacing - .5*L + .5*cell_spacing
                obstacle_y = j*cell_spacing - .5*L + .5*cell_spacing
                obstacles.append(obstacle_x)
                obstacles.append(obstacle_y)
        return obstacles
    def interaction_with_obstacles_numpy(self, obstacles, kappa, alpha, beta, gamma, Pe, Restar, dt):
        F = np.array([0., 0., 0.])
        # midpoint integration
        nobs = len(obstacles)//2
        mid_obs_array = np.array(obstacles).reshape(nobs, -1) # nobs x 2 array
        obs_array = np.hstack((mid_obs_array, np.zeros((nobs, 1)))) # add z=0 column
        del mid_obs_array
        r_nopbc = np.atleast_2d(self.X).repeat(nobs, axis=0) - obs_array
        r = r_nopbc + L*np.floor(-r_nopbc/L + .5) # minimum-image convention
        r[:, 2] = 0.
        rnorm = np.linalg.norm(r, axis=1)
        Re = (.5*self.sigma)**2*np.linalg.norm(self.W)/self.ni
        S = np.reciprocal(1+np.exp(-kappa*(Re*Re*Re*np.power(rnorm, -3.)-Restar)))
        F1 = alpha*np.sum(np.multiply(Re*Re*Re*np.power(rnorm, -3.)*rnorm, S))*np.cross(self.U, self.W)
        F2 = beta*np.sum(np.multiply(np.cross(np.atleast_2d(self.W).repeat(nobs, axis=0), r), np.power(rnorm, -3.)[:, None]), axis=0)
        F_attr = gamma*np.sum(((np.exp(-rnorm/kappa)*np.power(rnorm, -2))*(kappa+rnorm))[:, None]*r, axis=0)
        F = F1+F2+F_attr
        xi = np.random.normal(0, 1, size=2) # standard-normal random vector modelling thermal (Brownian) noise
        dr_therm_1 = np.sqrt(2*self.sigma**2*(.5*dt)/Pe)*xi
        xi = np.random.normal(0, 1, size=2)
        dr_therm_2 = np.sqrt(2*self.sigma**2*(.5*dt)/Pe)*xi
        dr_mid = F[:-1]*.5*dt + dr_therm_1
        self.U = np.array([2*dr_mid[0]/dt, 2*dr_mid[1]/dt, 0])
        r_nopbc = np.atleast_2d(self.X + np.append(dr_mid, np.zeros(1))).repeat(nobs, axis=0) - obs_array
        r = r_nopbc + L*np.floor(-r_nopbc/L + .5) # minimum-image convention, as in the first half-step
        r[:, 2] = 0.
        rnorm = np.linalg.norm(r, axis=1)
        Re = (.5*self.sigma)**2*np.linalg.norm(self.W)/self.ni
        S = np.reciprocal(1+np.exp(-kappa*(Re*Re*Re*np.power(rnorm, -3.)-Restar)))
        F1 = alpha*np.sum(np.multiply(Re*Re*Re*np.power(rnorm, -3.)*rnorm, S))*np.cross(self.U, self.W)
        F2 = beta*np.sum(np.multiply(np.cross(np.atleast_2d(self.W).repeat(nobs, axis=0), r), np.power(rnorm, -3.)[:, None]), axis=0)
        F_attr = gamma*np.sum(((np.exp(-rnorm/kappa)*np.power(rnorm, -2))*(kappa+rnorm))[:, None]*r, axis=0)
        F = F1+F2+F_attr
        dr = F[:-1]*dt + dr_therm_1 + dr_therm_2
        # update the spinner position
        self.X[:-1] += dr
        self.X_total[:-1] += dr
        self.U = np.array([dr[0]/dt, dr[1]/dt, 0])
        self.R = np.amin(rnorm)
        # keep the spinner inside the periodic box
        self.periodic_boundaries()
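    # Note on the integrator above:
    #   1. the deterministic force F(X) from all obstacles is evaluated at the current position
    #      (using minimum-image distances),
    #   2. a half step dr_mid = F(X)*dt/2 + thermal kick is taken and the force re-evaluated at X + dr_mid,
    #   3. the full step dr = F(X + dr_mid)*dt + both thermal kicks is applied (a midpoint / Heun-like scheme).
    # The force has three parts: F1 along U x W gated by the logistic switch S, F2 summing W x r / |r|^3
    # over obstacles, and a central term F_attr whose radial profile exp(-r/kappa)*(kappa+r)/r^2 is
    # consistent with the gradient of a Yukawa (screened) potential. The thermal displacements scale as
    # sqrt(2*sigma^2*(dt/2)/Pe), suggesting an effective translational diffusivity of order sigma^2/Pe.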
    def reinitialize(self, obstacles):
        # absolute position. -inf. <= x_total < inf. and -inf. <= z_total < inf.
        self.X = self.init_pos.copy() # copy so the in-place updates do not modify init_pos
        self.X_total = self.init_pos.copy()
        # particle orientation
        self.theta = np.random.uniform(0, 2*np.pi) # polar angle theta in the x-z plane
        self.p = np.array([np.cos(self.theta), np.sin(self.theta)]) # p = [px, pz]^T
        # translational and rotational velocity
        self.U = np.zeros(3, float)
        self.W = np.array([0., 0., 1.]) # initial angular velocity (spin about z)
        # distance between the swimmer and the nearest obstacle
        self.R = np.random.uniform(0, 2.5, 1)
        # history of local and global position. Only store information for this episode.
        self.history_X.clear()
        self.history_X_total.clear()
        self.history_X.append(self.X)
        self.history_X_total.append(self.X_total)
        # update coarse-grained state
        #self.update_state()
        self.t = 0
        self.obstacles = obstacles
    def calc_reward(self, n):
        # reward = displacement along y during the last stage, unwrapped across the periodic boundary
        first_r = self.history_X_total[-1][1]-self.history_X_total[-2][1]
        if first_r >= .5*L:
            print('low PBC')
            self.r[n] = first_r - 1.*L
        elif first_r <= -.5*L:
            print('high PBC')
            self.r[n] = first_r + 1.*L
        else:
            self.r[n] = first_r
    def update_state(self):
        #self.distance_obstacles()
        # z component of the angular velocity
        W_z = self.W[2]
        if W_z <= 0.175*self.ni/(.5*self.sigma*self.sigma):
            W_state = "wo"
        elif W_z > 0.175*self.ni/(.5*self.sigma*self.sigma):
            W_state = "wh"
        else:
            raise Exception("Invalid value of w detected: ", W_z)
        if self.R <= 1.25:
            R_state = "ri"
        elif self.R > 1.25:
            R_state = "rni"
        else:
            raise Exception("Invalid value of r detected: ", self.R)
        self.my_state = (R_state, W_state)
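    # The four coarse-grained states are the combinations of:
    #   distance to the nearest obstacle:  "ri" if R <= 1.25,   "rni" if R > 1.25
    #   spin relative to the critical Wc:  "wo" if W_z <= Wc,   "wh"  if W_z > Wc
    # where Wc = 0.175*ni/(0.5*sigma^2), the same threshold used by the actions below.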
    def take_greedy_action(self, Q):
        state_index = state_lookup_table[self.my_state]
        action_index = np.argmax(Q[state_index]) # find largest entry in this row of Q (i.e. this state)
        Wc = 0.175*self.ni/(.5*self.sigma*self.sigma)
        if action_index == 0: # increase W_z by (0.01/8)*Wc
            self.W[2] += .01/8*Wc
        elif action_index == 1: # decrease W_z by (0.01/8)*Wc
            self.W[2] -= .01/8*Wc
        else:
            raise Exception("Action index out of bounds: ", action_index)
        return action_index
    def take_random_action(self):
        action_index = np.random.randint(0, 2) # scalar action index (0 or 1)
        Wc = 0.175*self.ni/(.5*self.sigma*self.sigma)
        if action_index == 0: # increase W_z by (0.01/8)*Wc
            self.W[2] += .01/8*Wc
        else: # decrease W_z by (0.01/8)*Wc
            self.W[2] -= .01/8*Wc
        return action_index
    def periodic_boundaries(self, isxperiodic=True, isyperiodic=True, iszperiodic=True):
        offset = [math.floor(-self.X[0] * 1./L + 0.5),
                  math.floor(-self.X[1] * 1./L + 0.5),
                  0]
        if isxperiodic:
            self.X[0] += offset[0] * L
        if isyperiodic:
            self.X[1] += offset[1] * L
        if iszperiodic:
            self.X[2] += offset[2] * L
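    # periodic_boundaries folds the local position back into the box centred at the origin;
    # for example, with L = 50 a particle at x = 26 is shifted to x = -24.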
def training(alpha0, kappa, alphaMAG, beta, gammaYUK, Pe, dt, ni, sigma, Ns=4000, Ne=5000, Naction=100, gamma=0.999, eps0=0.0, n_updates=1000, \
             RIC=False, method="Qlearning", lr_decay=None, omega=0.85, eps_decay=True, Qin=None):
    # n_updates - how often to store the trajectory undertaken by the particle during the learning process
    # Ne - number of episodes
    # Ns - number of stages in an episode
    # Naction - number of integration sub-steps per stage
    # alpha0 - learning rate (or starting learning rate when employing LR decay)
    # gamma - discount factor, i.e. how much we weigh future relative to present rewards. Close to 0 = myopic view.
    # eps0 - fraction of the time we allow for exploration in selecting the next action. 0 = always greedy.
    # kappa, alphaMAG, beta, gammaYUK - parameters of the obstacle interaction force (see interaction_with_obstacles_numpy)
    # Pe - Peclet number setting the strength of the thermal noise; dt - timestep of each sub-step
    # ni, sigma - swimmer parameters (they set Re and the critical spin Wc)
    # RIC - Reset of Initial Conditions. First time a state-action pair is encountered, set Q[s,a] = reward
    # method - choose from "Qlearning", "doubleQ" (Double Q-learning) or "expSARSA" (Expected SARSA)
    # lr_decay - whether or not to use learning rate decay. Options are None, or polynomial (lr = 1/#(s,a)**omega)
    # omega - exponent used in lr_decay: lr = 1/#(s,a)**omega
    # eps_decay - whether or not to decay epsilon: eff_eps = eps0/k**omega for the k-th update
    # Qin - initial Q matrix. Useful for testing performance after an extensive exploration phase.
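    # Summary of the update rules implemented below:
    #   Q-learning:      Q[s,a] += alpha*(r + gamma*max_a' Q[s',a'] - Q[s,a])
    #   Double Q:        randomly update Q1 or Q2, bootstrapping from the other table
    #   Expected SARSA:  Q[s,a] += alpha*(r + gamma*sum_a' pi(a'|s')*Q[s',a'] - Q[s,a])
    # Exploration is eps-greedy with eff_eps = eps0/k**omega (when eps_decay is on), and the optional
    # learning-rate decay uses alpha = alpha0/(1 + #(s,a))**omega.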
    # if using the expected SARSA method, turn on epsilon decay since eps = 0 is simply Q-learning anyway
    if method == "expSARSA":
        eps_decay = True
        if eps0 == 0: eps0 = 1
    # Total reward for each episode
    hist_R_tot_smart = np.zeros(Ne)
    hist_R_tot_naive = np.zeros(Ne)
    # learning gain per episode
    Σ = np.zeros(Ne)
    smart_stored_histories = [] # store position = f(t) every so often for an episode (smart particles)
    naive_stored_histories = [] # store position = f(t) every so often for an episode (naive particles)
    # number of times each state-action pair has been explored
    state_action_counter = np.zeros((N_states, N_actions))
    # initialize a naive and a smart spinner sharing the same obstacle lattice
    naive = Swimmer(Ns, ni, sigma)
    smart = Swimmer(Ns, ni, sigma)
    naive.obstacles = smart.obstacles
    obstacles = naive.obstacles.copy()
    init_pos = copy.deepcopy(smart.X)
    print(init_pos.base is None) # debug check that init_pos owns its own memory
    # initialize Q matrix to a large value (optimistic initialization)
    if method == "doubleQ":
        Q1 = L*Ns*np.ones((4, 2))
        Q2 = L*Ns*np.ones((4, 2))
    else:
        Q = L*Ns*np.ones((4, 2)) # 4 states, 2 possible actions. Each column is an action (on w).
        if Qin is not None: Q = Qin
    # store average Q for each episode to track convergence
    avg_Q_history = np.zeros((Ne, 4, 2))
    # store initial position and orientation for each episode
    initial_coords = np.empty([Ne, 3], float)
    for k in range(Ne):
        initial_coords[k, :] = smart.X
    # iterate over episodes
    k = 0
    for ep in range(Ne):
        # reinitialize both spinners from the shared initial position with a fresh random orientation
        #print(init_pos)
        #print(smart.X)
        smart.init_pos = init_pos.copy()
        naive.init_pos = init_pos.copy()
        smart.reinitialize(obstacles)
        naive.reinitialize(obstacles)
        naive = copy.deepcopy(smart) # have naive and smart share initial conditions for visualization purposes
        # store initialization
        initial_coords[ep, 0:3] = smart.X
        # save selected actions and particle orientation for the last episode
        if ep == Ne - 1:
            chosen_actions = np.zeros(Ns)
            theta_history = np.zeros(Ns)
        # iterate over stages within an episode
        # per-stage positions go into fresh arrays each episode; copying values in (rather than appending
        # references) keeps earlier entries from being overwritten when the position arrays change in place
        smart_history_X_total = np.zeros((Ns, 3), float)
        naive_history_X_total = np.zeros((Ns, 3), float)
        for stage in range(Ns):
            # select an action eps-greedily. Note naive never changes its action/strategy (its spin stays fixed)
            Qinput = Q1 + Q2 if method == "doubleQ" else Q
            k = k + 1 # k-th update
            eff_eps = eps0/k**omega if eps_decay else eps0 # decrease amount of exploration as time proceeds
            if np.random.uniform(0, 1) < eff_eps:
                action = smart.take_random_action()
            else:
                action = smart.take_greedy_action(Qinput)
            # record action and orientation on the last episode
            if ep == Ne - 1:
                chosen_actions[stage] = action
                theta_history[stage] = smart.theta
            # record index of the prior state
            old_s = state_lookup_table[smart.my_state]
            # given the selected action, integrate the dynamics for Naction sub-steps
            for step in range(Naction):
                naive.interaction_with_obstacles_numpy(naive.obstacles, kappa, alphaMAG, beta, gammaYUK, Pe, .094, dt)
                smart.interaction_with_obstacles_numpy(smart.obstacles, kappa, alphaMAG, beta, gammaYUK, Pe, .094, dt)
            #print(init_pos)
            smart.history_X_total.append(smart.X_total.copy())
            #print('works', smart.history_X_total[-1])
            smart.history_X.append(smart.X.copy())
            naive.history_X_total.append(naive.X_total.copy())
            naive.history_X.append(naive.X.copy())
            smart_history_X_total[stage, :] = smart.X_total.copy()
            naive_history_X_total[stage, :] = naive.X_total.copy()
            #print(smart.history_X_total)
            smart.update_state() # only need to update the smart particle since naive never changes its action
            #print(ep, smart.R, smart.W[2])
            # calculate reward based on the new state
            naive.calc_reward(stage)
            smart.calc_reward(stage)
            new_s = state_lookup_table[smart.my_state]
            state_action_counter[new_s, action] += 1
            # employ learning rate decay if applicable
            alpha = alpha0/(1+state_action_counter[old_s, action])**omega if lr_decay else alpha0
            # update Q matrix
            if method == "doubleQ":
                if np.random.uniform(0, 1) < 0.5: # update Q1
                    if Q1[old_s, action] == L*Ns and RIC == True: # apply Reset of Initial Conditions (RIC)
                        Q1[old_s, action] = smart.r[stage]
                    else:
                        Q1[old_s, action] = Q1[old_s, action] + alpha*(smart.r[stage] + \
                            gamma*np.max(Q2[new_s, :])-Q1[old_s, action])
                else: # update Q2
                    if Q2[old_s, action] == L*Ns and RIC == True:
                        Q2[old_s, action] = smart.r[stage]
                    else:
                        Q2[old_s, action] = Q2[old_s, action] + alpha*(smart.r[stage] + \
                            gamma*np.max(Q1[new_s, :])-Q2[old_s, action])
            elif method == "expSARSA": # elif so the plain Q-learning branch below is not also run for doubleQ
                # calculate V, the expected Q value for the next state-action pair
                V = 0
                greedy_action = np.argmax(Q[new_s]) # would-be greedy action for new state
                for new_action in range(N_actions):
                    pi = (1 - eff_eps) + eff_eps/N_actions if new_action == greedy_action else eff_eps/N_actions
                    V = V + pi*Q[new_s, new_action]
                if Q[old_s, action] == L*Ns and RIC == True:
                    Q[old_s, action] = smart.r[stage]
                else:
                    Q[old_s, action] = Q[old_s, action] + alpha*(smart.r[stage] + gamma*V - Q[old_s, action])
            else:
                if Q[old_s, action] == L*Ns and RIC == True:
                    Q[old_s, action] = smart.r[stage]
                else:
                    Q[old_s, action] = Q[old_s, action] + alpha*(smart.r[stage] + \
                        gamma*np.max(Q[new_s, :])-Q[old_s, action])
            #print('test', smart.history_X_total[-1])
            # accumulate Q within the episode to track convergence (averaged over stages below)
            avg_Q_history[ep] = avg_Q_history[ep] + Q1 + Q2 if method == "doubleQ" else avg_Q_history[ep] + Q
        avg_Q_history[ep] = avg_Q_history[ep]/Ns
        # calculate Rtot for this episode
        R_tot_naive = np.sum(naive.r)
        R_tot_smart = np.sum(smart.r)
        print('Episode %d, reward: %.5f' % (ep, R_tot_smart))
        if R_tot_naive < .0000001:
            R_tot_naive = .0000001
        # calculate learning gain for this episode
        Σ[ep] = R_tot_smart/R_tot_naive - 1
        hist_R_tot_smart[ep] = R_tot_smart
        hist_R_tot_naive[ep] = R_tot_naive
        # store the trajectory every so often
        if ep % n_updates == 0 or ep == Ne-1:
            smart_stored_histories.append((ep, smart_history_X_total))
            naive_stored_histories.append((ep, naive_history_X_total))
        # save optimal policy
        if ep == Ne-1:
            filename = "Policies/Q_alpha_" + str(alpha).replace(".", "d") + "_Ns_" + str(Ns) + "_Ne_" + str(Ne) + \
                "_sigma_" + str(sigma).replace(".", "d") + "_Pe_" + str(Pe).replace(".", "d") + "_eps_" \
                + str(eff_eps).replace(".", "d") + "_epsdecay_" + str(eps_decay)
            if lr_decay: filename = filename + "_omega_" + str(omega)
            if method == "doubleQ": filename = filename + "_" + str(method)
            if RIC: filename = filename + "_RIC_" + str(RIC)
            Qout = Q1 + Q2 if method == "doubleQ" else Q
            os.makedirs("Policies", exist_ok=True) # make sure the output directory exists before saving
            np.save(filename, Qout)
    return Qout, Σ, smart, naive, hist_R_tot_smart, hist_R_tot_naive, smart_stored_histories, naive_stored_histories, \
        state_action_counter, chosen_actions, avg_Q_history, initial_coords, theta_history, obstacles
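# A minimal sketch of how a saved policy could be reloaded later (commented out; it assumes the script
# is run from the same working directory and that at least one .npy file exists in Policies/):
# import glob
# latest = max(glob.glob("Policies/Q_*.npy"), key=os.path.getmtime)
# Q_saved = np.load(latest)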
#Plot
# Number of stages per episode
Ns = 40
# Number of episodes
Ne = 10
traj = []
my_alpha0 = 1.0
my_eps0 = 1.0
naction = 100
stepsupdate = 75
kappa = 2.5
alphaMAG = 1
beta = 1.
gammaYUK = 2.5e-4
Pe = 10000
dt = 0.0002
ni = 1.
sigma = 1.
Q, Σ, smart, naive, hist_R_tot_smart, hist_R_tot_naive, smart_stored_histories, naive_stored_histories, \
    state_action_counter, chosen_actions, avg_Q_hist, initial_coords, theta_history, obstacles \
    = training(my_alpha0, kappa, alphaMAG, beta, gammaYUK, Pe, dt, ni, sigma, Ns, Ne, naction, 0.999, 0.5, stepsupdate)
#print(smart_stored_histories[1][1][:, 0])
#print(smart_stored_histories[0][1].shape)
fig, ax = plt.subplots(1, 1)
#ax.plot(np.array(traj[::2]), np.array(traj[1::2]), '.') # traj is never filled in this script
ax.plot(np.array(obstacles[::2]), np.array(obstacles[1::2]), '.')
# plot every stored smart trajectory (training stores one every n_updates episodes plus the final one)
for ep, hist in smart_stored_histories:
    ax.plot(hist[:, 0], hist[:, 1], '-', label='episode %d' % ep, alpha=.7)
# plot the naive spinner for the last stored episode
naive_ep, naive_hist = naive_stored_histories[-1]
ax.plot(naive_hist[:, 0], naive_hist[:, 1], '-', label='naive spinner')
ax.set_aspect('equal')
ax.legend()
#plt.show()
fig.savefig('trajectories.png')
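# Optional diagnostic (a minimal sketch using only variables returned by training() above):
# total reward per episode for the smart and naive spinners, and the learning gain Σ.
fig2, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
ax1.plot(hist_R_tot_smart, label='smart')
ax1.plot(hist_R_tot_naive, label='naive')
ax1.set_ylabel('total reward per episode')
ax1.legend()
ax2.plot(Σ)
ax2.set_ylabel('learning gain')
ax2.set_xlabel('episode')
fig2.savefig('learning_gain.png')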