Self-driving car simulation with Carla and Python, part 5


    Welcome to part 5 of the self-driving cars with reinforcement learning series, using Carla, Python, and TensorFlow. Now that we have both an environment and an agent, we just need some logic to tie them together, which is what we'll do next. The full code up to this point:

    import glob
    import os
    import sys
    import random
    import time
    import numpy as np
    import cv2
    import math
    from collections import deque
    from keras.applications.xception import Xception
    from keras.layers import Dense, GlobalAveragePooling2D
    from keras.optimizers import Adam
    from keras.models import Model

    try:
        sys.path.append(glob.glob('../carla/dist/carla-*%d.%d-%s.egg' % (
            sys.version_info.major,
            sys.version_info.minor,
            'win-amd64' if os.name == 'nt' else 'linux-x86_64'))[0])
    except IndexError:
        pass
    import carla

    SHOW_PREVIEW = False
    IM_WIDTH = 640
    IM_HEIGHT = 480
    SECONDS_PER_EPISODE = 10
    REPLAY_MEMORY_SIZE = 5_000
    MIN_REPLAY_MEMORY_SIZE = 1_000
    MINIBATCH_SIZE = 16
    PREDICTION_BATCH_SIZE = 1
    TRAINING_BATCH_SIZE = MINIBATCH_SIZE // 4
    UPDATE_TARGET_EVERY = 5
    MODEL_NAME = "Xception"

    MEMORY_FRACTION = 0.8
    MIN_REWARD = -200

    EPISODES = 100

    DISCOUNT = 0.99
    epsilon = 1
    EPSILON_DECAY = 0.95 ## 0.9975 99975
    MIN_EPSILON = 0.001

    AGGREGATE_STATS_EVERY = 10


    class CarEnv:
        SHOW_CAM = SHOW_PREVIEW
        STEER_AMT = 1.0
        im_width = IM_WIDTH
        im_height = IM_HEIGHT
        front_camera = None

        def __init__(self):
            self.client = carla.Client("localhost", 2000)
            self.client.set_timeout(2.0)
            self.world = self.client.get_world()
            self.blueprint_library = self.world.get_blueprint_library()
            self.model_3 = self.blueprint_library.filter("model3")[0]

        def reset(self):
            self.collision_hist = []
            self.actor_list = []

            self.transform = random.choice(self.world.get_map().get_spawn_points())
            self.vehicle = self.world.spawn_actor(self.model_3, self.transform)
            self.actor_list.append(self.vehicle)

            self.rgb_cam = self.blueprint_library.find('sensor.camera.rgb')
            self.rgb_cam.set_attribute("image_size_x", f"{self.im_width}")
            self.rgb_cam.set_attribute("image_size_y", f"{self.im_height}")
            self.rgb_cam.set_attribute("fov", f"110")

            transform = carla.Transform(carla.Location(x=2.5, z=0.7))
            self.sensor = self.world.spawn_actor(self.rgb_cam, transform, attach_to=self.vehicle)
            self.actor_list.append(self.sensor)
            self.sensor.listen(lambda data: self.process_img(data))

            self.vehicle.apply_control(carla.VehicleControl(throttle=0.0, brake=0.0))
            time.sleep(4)

            colsensor = self.blueprint_library.find("sensor.other.collision")
            self.colsensor = self.world.spawn_actor(colsensor, transform, attach_to=self.vehicle)
            self.actor_list.append(self.colsensor)
            self.colsensor.listen(lambda event: self.collision_data(event))

            while self.front_camera is None:
                time.sleep(0.01)

            self.episode_start = time.time()
            self.vehicle.apply_control(carla.VehicleControl(throttle=0.0, brake=0.0))

            return self.front_camera

        def collision_data(self, event):
            self.collision_hist.append(event)

        def process_img(self, image):
            i = np.array(image.raw_data)
            #print(i.shape)
            i2 = i.reshape((self.im_height, self.im_width, 4))
            i3 = i2[:, :, :3]
            if self.SHOW_CAM:
                cv2.imshow("", i3)
                cv2.waitKey(1)
            self.front_camera = i3

        def step(self, action):
            if action == 0:
                self.vehicle.apply_control(carla.VehicleControl(throttle=1.0, steer=-1*self.STEER_AMT))
            elif action == 1:
                self.vehicle.apply_control(carla.VehicleControl(throttle=1.0, steer= 0))
            elif action == 2:
                self.vehicle.apply_control(carla.VehicleControl(throttle=1.0, steer=1*self.STEER_AMT))

            v = self.vehicle.get_velocity()
            kmh = int(3.6 * math.sqrt(v.x**2 + v.y**2 + v.z**2))

            if len(self.collision_hist) != 0:
                done = True
                reward = -200
            elif kmh < 50:
                done = False
                reward = -1
            else:
                done = False
                reward = 1

            if self.episode_start + SECONDS_PER_EPISODE < time.time():
                done = True

            return self.front_camera, reward, done, None


    class DQNAgent:
        def __init__(self):
            self.model = self.create_model()
            self.target_model = self.create_model()
            self.target_model.set_weights(self.model.get_weights())

            self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

            self.tensorboard = ModifiedTensorBoard(log_dir=f"logs/{MODEL_NAME}-{int(time.time())}")
            self.target_update_counter = 0
            self.graph = tf.get_default_graph()

            self.terminate = False
            self.last_logged_episode = 0
            self.training_initialized = False

        def create_model(self):
            base_model = Xception(weights=None, include_top=False, input_shape=(IM_HEIGHT, IM_WIDTH, 3))

            x = base_model.output
            x = GlobalAveragePooling2D()(x)

            predictions = Dense(3, activation="linear")(x)
            model = Model(inputs=base_model.input, outputs=predictions)
            model.compile(loss="mse", optimizer=Adam(lr=0.001), metrics=["accuracy"])
            return model

        def update_replay_memory(self, transition):
            # transition = (current_state, action, reward, new_state, done)
            self.replay_memory.append(transition)

        def train(self):
            if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
                return

            minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

            current_states = np.array([transition[0] for transition in minibatch])/255
            with self.graph.as_default():
                current_qs_list = self.model.predict(current_states, PREDICTION_BATCH_SIZE)

            new_current_states = np.array([transition[3] for transition in minibatch])/255
            with self.graph.as_default():
                future_qs_list = self.target_model.predict(new_current_states, PREDICTION_BATCH_SIZE)

            X = []
            y = []

            for index, (current_state, action, reward, new_state, done) in enumerate(minibatch):
                if not done:
                    max_future_q = np.max(future_qs_list[index])
                    new_q = reward + DISCOUNT * max_future_q
                else:
                    new_q = reward

                current_qs = current_qs_list[index]
                current_qs[action] = new_q

                X.append(current_state)
                y.append(current_qs)

            log_this_step = False
            if self.tensorboard.step > self.last_logged_episode:
                log_this_step = True
                self.last_log_episode = self.tensorboard.step

            with self.graph.as_default():
                self.model.fit(np.array(X)/255, np.array(y), batch_size=TRAINING_BATCH_SIZE, verbose=0, shuffle=False, callbacks=[self.tensorboard] if log_this_step else None)

            if log_this_step:
                self.target_update_counter += 1

            if self.target_update_counter > UPDATE_TARGET_EVERY:
                self.target_model.set_weights(self.model.get_weights())
                self.target_update_counter = 0

        def get_qs(self, state):
            return self.model.predict(np.array(state).reshape(-1, *state.shape)/255)[0]

        def train_in_loop(self):
            X = np.random.uniform(size=(1, IM_HEIGHT, IM_WIDTH, 3)).astype(np.float32)
            y = np.random.uniform(size=(1, 3)).astype(np.float32)
            with self.graph.as_default():
                self.model.fit(X, y, verbose=False, batch_size=1)

            self.training_initialized = True

            while True:
                if self.terminate:
                    return
                self.train()
                time.sleep(0.01)

    To start, we'll copy and paste the modified TensorBoard class from the reinforcement learning tutorial series:

    from keras.callbacks import TensorBoard

    ...

    # Own Tensorboard class
    class ModifiedTensorBoard(TensorBoard):

        # Overriding init to set initial step and writer (we want one log file for all .fit() calls)
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.step = 1
            self.writer = tf.summary.FileWriter(self.log_dir)

        # Overriding this method to stop creating default log writer
        def set_model(self, model):
            pass

        # Overrided, saves logs with our step number
        # (otherwise every .fit() will start writing from 0th step)
        def on_epoch_end(self, epoch, logs=None):
            self.update_stats(**logs)

        # Overrided
        # We train for one batch only, no need to save anything at epoch end
        def on_batch_end(self, batch, logs=None):
            pass

        # Overrided, so won't close writer
        def on_train_end(self, _):
            pass

        # Custom method for saving own metrics
        # Creates writer, writes custom metrics and closes writer
        def update_stats(self, **stats):
            self._write_logs(stats, self.step)

    As a reminder, the code above exists purely to cut down on the amount of logging TensorFlow/TensorBoard would otherwise do. Normally you get a new log file per .fit() call and a data point per step, which gets absurd very quickly in reinforcement learning (you may call .fit() every single step!). Next, let's add the following imports:

    import tensorflow as tf
    import keras.backend.tensorflow_backend as backend
    from threading import Thread
    from tqdm import tqdm

    After that, we'll head to the bottom of our script:

    if __name__ == '__main__':
        FPS = 60
        # For stats
        ep_rewards = [-200]

        # For more repetitive results
        random.seed(1)
        np.random.seed(1)
        tf.set_random_seed(1)

        # Memory fraction, used mostly when training multiple agents
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=MEMORY_FRACTION)
        backend.set_session(tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)))

        # Create models folder
        if not os.path.isdir('models'):
            os.makedirs('models')

        # Create agent and environment
        agent = DQNAgent()
        env = CarEnv()

    To start, we set some FPS (frames per second) value. Early on we will have a very high rate of exploration, meaning we usually pick a random action rather than predicting one with the neural network. Picking a random action is much faster than predicting one, so we can even things out by imposing some general FPS cap with an artificial delay; once epsilon reaches 0, you should set this to whatever your actual FPS is (see the short sketch after this paragraph). We then set the random seeds for more repeatable results and specify the GPU memory fraction. You may not need that last part, but my RTX Titan seems to have issues, at least on Windows, where it runs out of memory when it tries to allocate as much as it possibly can. Next, we create the models directory if it doesn't already exist, and then create the agent and environment.
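    If you'd rather ground that FPS constant in a measurement than a guess, a minimal sketch like the one below (my own addition, not part of the tutorial; measure_prediction_fps is a hypothetical helper) times a few calls to agent.get_qs and reports predictions per second:

    import time
    import numpy as np

    def measure_prediction_fps(agent, env, n=20):
        dummy_state = np.ones((env.im_height, env.im_width, 3))
        agent.get_qs(dummy_state)              # warm-up call; the first prediction is slow
        start = time.time()
        for _ in range(n):
            agent.get_qs(dummy_state)
        return n / (time.time() - start)       # predictions per second

    # e.g. FPS = round(measure_prediction_fps(agent, env)), once epsilon has decayed to ~0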

        # Start training thread and wait for training to be initialized
        trainer_thread = Thread(target=agent.train_in_loop, daemon=True)
        trainer_thread.start()
        while not agent.training_initialized:
            time.sleep(0.01)

    As the comment says, we start the training thread and wait for training to be initialized!

        # Initialize predictions - first prediction takes longer as of initialization that has to be done
        # It's better to do a first prediction now, before we start iterating over episode steps
        agent.get_qs(np.ones((env.im_height, env.im_width, 3)))

    Now we're ready to start iterating over however many episodes we've set:

        # Iterate over episodes
        for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
            #try:

            env.collision_hist = []

            # Update tensorboard step every episode
            agent.tensorboard.step = episode

            # Restarting episode - reset episode reward and step number
            episode_reward = 0
            step = 1

            # Reset environment and get initial state
            current_state = env.reset()

            # Reset flag and start iterating until episode ends
            done = False
            episode_start = time.time()

    Those are some initial values for our environment, and now we're ready to actually run. An episode keeps running until it is done, so we can use a while True loop and break on the done flag. While playing, we either take a random action, or work out our current action from our agent's model:

            # Play for given number of seconds only
            while True:

                # This part stays mostly the same, the change is to query a model for Q values
                if np.random.random() > epsilon:
                    # Get action from Q table
                    action = np.argmax(agent.get_qs(current_state))
                else:
                    # Get random action
                    action = np.random.randint(0, 3)
                    # This takes no time, so we add a delay matching 60 FPS (prediction above takes longer)
                    time.sleep(1/FPS)

    Now we get our information back from our environment's .step() method, passing our action as the argument:

                new_state, reward, done, _ = env.step(action)

                # Transform new continuous state to new discrete state and count reward
                episode_reward += reward

                # Every step we update replay memory
                agent.update_replay_memory((current_state, action, reward, new_state, done))

                current_state = new_state
                step += 1

                if done:
                    break

    Once the episode is done, what else do we need to do? To start, we need to get rid of our actors:

            # End of episode - destroy agents
            for actor in env.actor_list:
                actor.destroy()
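    As an aside (my own note, not something this part of the tutorial does), CARLA also offers a batched destroy command, which CARLA's own example scripts and earlier parts of this series use; something along these lines:

    # Batched alternative to the loop above - destroy every spawned actor in one call
    env.client.apply_batch([carla.command.DestroyActor(actor) for actor in env.actor_list])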

    Now for some stats, plus saving models that achieved a good reward (or whatever other rule(s) you decide to put in an if statement):

            # Append episode reward to a list and log stats (every given number of episodes)
            ep_rewards.append(episode_reward)
            if not episode % AGGREGATE_STATS_EVERY or episode == 1:
                average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:])/len(ep_rewards[-AGGREGATE_STATS_EVERY:])
                min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
                max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
                agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)

                # Save model, but only when min reward is greater or equal a set value
                if min_reward >= MIN_REWARD:
                    agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

    Next, let's decay our exploration rate:

            # Decay epsilon
            if epsilon > MIN_EPSILON:
                epsilon *= EPSILON_DECAY
                epsilon = max(MIN_EPSILON, epsilon)
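    As a quick back-of-the-envelope check (my own addition, not from the original article) of how fast this schedule anneals when the 0.95 decay is applied once per episode:

    import math

    # Episodes needed for epsilon to fall from 1 to MIN_EPSILON = 0.001 at a 0.95 decay per episode
    episodes_to_min = math.log(0.001) / math.log(0.95)   # ~135 episodes
    print(episodes_to_min)

    # After the 100 episodes we actually run here, epsilon is still above the floor
    print(0.95 ** 100)                                    # ~0.006

    With the commented-out 0.9975 decay instead, epsilon would still be around 0.78 after 100 episodes, i.e. mostly random exploration.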

    Finally, once we've actually iterated through all of our target episodes, we can exit:

        # Set termination flag for training thread and wait for it to finish
        agent.terminate = True
        trainer_thread.join()
        agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

    And that should be everything. The full code up to this point:

    import glob
    import os
    import sys
    import random
    import time
    import numpy as np
    import cv2
    import math
    from collections import deque
    from keras.applications.xception import Xception
    from keras.layers import Dense, GlobalAveragePooling2D
    from keras.optimizers import Adam
    from keras.models import Model
    from keras.callbacks import TensorBoard

    import tensorflow as tf
    import keras.backend.tensorflow_backend as backend
    from threading import Thread
    from tqdm import tqdm

    try:
        sys.path.append(glob.glob('../carla/dist/carla-*%d.%d-%s.egg' % (
            sys.version_info.major,
            sys.version_info.minor,
            'win-amd64' if os.name == 'nt' else 'linux-x86_64'))[0])
    except IndexError:
        pass
    import carla

    SHOW_PREVIEW = False
    IM_WIDTH = 640
    IM_HEIGHT = 480
    SECONDS_PER_EPISODE = 10
    REPLAY_MEMORY_SIZE = 5_000
    MIN_REPLAY_MEMORY_SIZE = 1_000
    MINIBATCH_SIZE = 16
    PREDICTION_BATCH_SIZE = 1
    TRAINING_BATCH_SIZE = MINIBATCH_SIZE // 4
    UPDATE_TARGET_EVERY = 5
    MODEL_NAME = "Xception"

    MEMORY_FRACTION = 0.4
    MIN_REWARD = -200

    EPISODES = 100

    DISCOUNT = 0.99
    epsilon = 1
    EPSILON_DECAY = 0.95 ## 0.9975 99975
    MIN_EPSILON = 0.001

    AGGREGATE_STATS_EVERY = 10


    # Own Tensorboard class
    class ModifiedTensorBoard(TensorBoard):

        # Overriding init to set initial step and writer (we want one log file for all .fit() calls)
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.step = 1
            self.writer = tf.summary.FileWriter(self.log_dir)

        # Overriding this method to stop creating default log writer
        def set_model(self, model):
            pass

        # Overrided, saves logs with our step number
        # (otherwise every .fit() will start writing from 0th step)
        def on_epoch_end(self, epoch, logs=None):
            self.update_stats(**logs)

        # Overrided
        # We train for one batch only, no need to save anything at epoch end
        def on_batch_end(self, batch, logs=None):
            pass

        # Overrided, so won't close writer
        def on_train_end(self, _):
            pass

        # Custom method for saving own metrics
        # Creates writer, writes custom metrics and closes writer
        def update_stats(self, **stats):
            self._write_logs(stats, self.step)


    class CarEnv:
        SHOW_CAM = SHOW_PREVIEW
        STEER_AMT = 1.0
        im_width = IM_WIDTH
        im_height = IM_HEIGHT
        front_camera = None

        def __init__(self):
            self.client = carla.Client("localhost", 2000)
            self.client.set_timeout(2.0)
            self.world = self.client.get_world()
            self.blueprint_library = self.world.get_blueprint_library()
            self.model_3 = self.blueprint_library.filter("model3")[0]

        def reset(self):
            self.collision_hist = []
            self.actor_list = []

            self.transform = random.choice(self.world.get_map().get_spawn_points())
            self.vehicle = self.world.spawn_actor(self.model_3, self.transform)
            self.actor_list.append(self.vehicle)

            self.rgb_cam = self.blueprint_library.find('sensor.camera.rgb')
            self.rgb_cam.set_attribute("image_size_x", f"{self.im_width}")
            self.rgb_cam.set_attribute("image_size_y", f"{self.im_height}")
            self.rgb_cam.set_attribute("fov", f"110")

            transform = carla.Transform(carla.Location(x=2.5, z=0.7))
            self.sensor = self.world.spawn_actor(self.rgb_cam, transform, attach_to=self.vehicle)
            self.actor_list.append(self.sensor)
            self.sensor.listen(lambda data: self.process_img(data))

            self.vehicle.apply_control(carla.VehicleControl(throttle=0.0, brake=0.0))
            time.sleep(4)

            colsensor = self.blueprint_library.find("sensor.other.collision")
            self.colsensor = self.world.spawn_actor(colsensor, transform, attach_to=self.vehicle)
            self.actor_list.append(self.colsensor)
            self.colsensor.listen(lambda event: self.collision_data(event))

            while self.front_camera is None:
                time.sleep(0.01)

            self.episode_start = time.time()
            self.vehicle.apply_control(carla.VehicleControl(throttle=0.0, brake=0.0))

            return self.front_camera

        def collision_data(self, event):
            self.collision_hist.append(event)

        def process_img(self, image):
            i = np.array(image.raw_data)
            #print(i.shape)
            i2 = i.reshape((self.im_height, self.im_width, 4))
            i3 = i2[:, :, :3]
            if self.SHOW_CAM:
                cv2.imshow("", i3)
                cv2.waitKey(1)
            self.front_camera = i3

        def step(self, action):
            if action == 0:
                self.vehicle.apply_control(carla.VehicleControl(throttle=1.0, steer=-1*self.STEER_AMT))
            elif action == 1:
                self.vehicle.apply_control(carla.VehicleControl(throttle=1.0, steer= 0))
            elif action == 2:
                self.vehicle.apply_control(carla.VehicleControl(throttle=1.0, steer=1*self.STEER_AMT))

            v = self.vehicle.get_velocity()
            kmh = int(3.6 * math.sqrt(v.x**2 + v.y**2 + v.z**2))

            if len(self.collision_hist) != 0:
                done = True
                reward = -200
            elif kmh < 50:
                done = False
                reward = -1
            else:
                done = False
                reward = 1

            if self.episode_start + SECONDS_PER_EPISODE < time.time():
                done = True

            return self.front_camera, reward, done, None


    class DQNAgent:
        def __init__(self):
            self.model = self.create_model()
            self.target_model = self.create_model()
            self.target_model.set_weights(self.model.get_weights())

            self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

            self.tensorboard = ModifiedTensorBoard(log_dir=f"logs/{MODEL_NAME}-{int(time.time())}")
            self.target_update_counter = 0
            self.graph = tf.get_default_graph()

            self.terminate = False
            self.last_logged_episode = 0
            self.training_initialized = False

        def create_model(self):
            base_model = Xception(weights=None, include_top=False, input_shape=(IM_HEIGHT, IM_WIDTH, 3))

            x = base_model.output
            x = GlobalAveragePooling2D()(x)

            predictions = Dense(3, activation="linear")(x)
            model = Model(inputs=base_model.input, outputs=predictions)
            model.compile(loss="mse", optimizer=Adam(lr=0.001), metrics=["accuracy"])
            return model

        def update_replay_memory(self, transition):
            # transition = (current_state, action, reward, new_state, done)
            self.replay_memory.append(transition)

        def train(self):
            if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
                return

            minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

            current_states = np.array([transition[0] for transition in minibatch])/255
            with self.graph.as_default():
                current_qs_list = self.model.predict(current_states, PREDICTION_BATCH_SIZE)

            new_current_states = np.array([transition[3] for transition in minibatch])/255
            with self.graph.as_default():
                future_qs_list = self.target_model.predict(new_current_states, PREDICTION_BATCH_SIZE)

            X = []
            y = []

            for index, (current_state, action, reward, new_state, done) in enumerate(minibatch):
                if not done:
                    max_future_q = np.max(future_qs_list[index])
                    new_q = reward + DISCOUNT * max_future_q
                else:
                    new_q = reward

                current_qs = current_qs_list[index]
                current_qs[action] = new_q

                X.append(current_state)
                y.append(current_qs)

            log_this_step = False
            if self.tensorboard.step > self.last_logged_episode:
                log_this_step = True
                self.last_log_episode = self.tensorboard.step

            with self.graph.as_default():
                self.model.fit(np.array(X)/255, np.array(y), batch_size=TRAINING_BATCH_SIZE, verbose=0, shuffle=False, callbacks=[self.tensorboard] if log_this_step else None)

            if log_this_step:
                self.target_update_counter += 1

            if self.target_update_counter > UPDATE_TARGET_EVERY:
                self.target_model.set_weights(self.model.get_weights())
                self.target_update_counter = 0

        def get_qs(self, state):
            return self.model.predict(np.array(state).reshape(-1, *state.shape)/255)[0]

        def train_in_loop(self):
            X = np.random.uniform(size=(1, IM_HEIGHT, IM_WIDTH, 3)).astype(np.float32)
            y = np.random.uniform(size=(1, 3)).astype(np.float32)
            with self.graph.as_default():
                self.model.fit(X, y, verbose=False, batch_size=1)

            self.training_initialized = True

            while True:
                if self.terminate:
                    return
                self.train()
                time.sleep(0.01)


    if __name__ == '__main__':
        FPS = 60
        # For stats
        ep_rewards = [-200]

        # For more repetitive results
        random.seed(1)
        np.random.seed(1)
        tf.set_random_seed(1)

        # Memory fraction, used mostly when training multiple agents
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=MEMORY_FRACTION)
        backend.set_session(tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)))

        # Create models folder
        if not os.path.isdir('models'):
            os.makedirs('models')

        # Create agent and environment
        agent = DQNAgent()
        env = CarEnv()

        # Start training thread and wait for training to be initialized
        trainer_thread = Thread(target=agent.train_in_loop, daemon=True)
        trainer_thread.start()
        while not agent.training_initialized:
            time.sleep(0.01)

        # Initialize predictions - first prediction takes longer as of initialization that has to be done
        # It's better to do a first prediction now, before we start iterating over episode steps
        agent.get_qs(np.ones((env.im_height, env.im_width, 3)))

        # Iterate over episodes
        for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
            #try:

            env.collision_hist = []

            # Update tensorboard step every episode
            agent.tensorboard.step = episode

            # Restarting episode - reset episode reward and step number
            episode_reward = 0
            step = 1

            # Reset environment and get initial state
            current_state = env.reset()

            # Reset flag and start iterating until episode ends
            done = False
            episode_start = time.time()

            # Play for given number of seconds only
            while True:

                # This part stays mostly the same, the change is to query a model for Q values
                if np.random.random() > epsilon:
                    # Get action from Q table
                    action = np.argmax(agent.get_qs(current_state))
                else:
                    # Get random action
                    action = np.random.randint(0, 3)
                    # This takes no time, so we add a delay matching 60 FPS (prediction above takes longer)
                    time.sleep(1/FPS)

                new_state, reward, done, _ = env.step(action)

                # Transform new continuous state to new discrete state and count reward
                episode_reward += reward

                # Every step we update replay memory
                agent.update_replay_memory((current_state, action, reward, new_state, done))

                current_state = new_state
                step += 1

                if done:
                    break

            # End of episode - destroy agents
            for actor in env.actor_list:
                actor.destroy()

            # Append episode reward to a list and log stats (every given number of episodes)
            ep_rewards.append(episode_reward)
            if not episode % AGGREGATE_STATS_EVERY or episode == 1:
                average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:])/len(ep_rewards[-AGGREGATE_STATS_EVERY:])
                min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
                max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
                agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)

                # Save model, but only when min reward is greater or equal a set value
                if min_reward >= MIN_REWARD:
                    agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

            # Decay epsilon
            if epsilon > MIN_EPSILON:
                epsilon *= EPSILON_DECAY
                epsilon = max(MIN_EPSILON, epsilon)

        # Set termination flag for training thread and wait for it to finish
        agent.terminate = True
        trainer_thread.join()
        agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

    Let's run it; it will go for 100 episodes. Those 100 episodes took about 17 minutes on a Titan RTX. You should wind up with some log files, so let's check them out:

    tensorboard --logdir=logs/

    Depending on your operating system, what you need to navigate to may vary. On Linux, whatever it tells you (it should print a URL to the console) should be enough, and 127.0.0.1:6006 will likely work too. On Windows, the only thing I found to work for me was localhost:6006. Whatever gets you there! Once you're in, you can filter the tags with a regex like \w (any word character) to view all of the graphs together. Looking at my graphs: unsurprisingly, we did not suddenly learn how to be an awesome driver within 100 episodes (judging by our average reward), so we should probably cancel those investor meetings for pitching the next billion-dollar self-driving car startup. I actually forgot to add the model-saving code before my first test run, so I ended up running it again, and the second time I got somewhat better results. The main thing to note is how the loss recovers after blowing up. Keep in mind we've only run 100 episodes. Even if everything else about our setup were solved, I suspect we'd need something more like 100K episodes before we see decent behavior. That said, it helps to actually "see" your agent run, so here is a quick script that just plays episodes so you can watch your model in action:

    import random
    from collections import deque
    import numpy as np
    import cv2
    import time
    import tensorflow as tf
    import keras.backend.tensorflow_backend as backend
    from keras.models import load_model
    from tutorial5_code import CarEnv, MEMORY_FRACTION

    MODEL_PATH = 'models/Xception__-118.00max_-179.10avg_-250.00min__1566603992.model'

    if __name__ == '__main__':

        # Memory fraction
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=MEMORY_FRACTION)
        backend.set_session(tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)))

        # Load the model
        model = load_model(MODEL_PATH)

        # Create environment
        env = CarEnv()

        # For agent speed measurements - keeps last 60 frametimes
        fps_counter = deque(maxlen=60)

        # Initialize predictions - first prediction takes longer as of initialization that has to be done
        # It's better to do a first prediction then before we start iterating over episode steps
        model.predict(np.ones((1, env.im_height, env.im_width, 3)))

        # Loop over episodes
        while True:

            print('Restarting episode')

            # Reset environment and get initial state
            current_state = env.reset()
            env.collision_hist = []

            done = False

            # Loop over steps
            while True:

                # For FPS counter
                step_start = time.time()

                # Show current frame
                cv2.imshow(f'Agent - preview', current_state)
                cv2.waitKey(1)

                # Predict an action based on current observation space
                qs = model.predict(np.array(current_state).reshape(-1, *current_state.shape)/255)[0]
                action = np.argmax(qs)

                # Step environment (additional flag informs environment to not break an episode by time limit)
                new_state, reward, done, _ = env.step(action)

                # Set current step for next loop iteration
                current_state = new_state

                # If done - agent crashed, break an episode
                if done:
                    break

                # Measure step time, append to a deque, then print mean FPS for last 60 frames, q values and taken action
                frame_time = time.time() - step_start
                fps_counter.append(frame_time)
                print(f'Agent: {len(fps_counter)/sum(fps_counter):>4.1f} FPS | Action: [{qs[0]:>5.2f}, {qs[1]:>5.2f}, {qs[2]:>5.2f}] {action}')

            # Destroy an actor at end of episode
            for actor in env.actor_list:
                actor.destroy()

    Rename the tutorial5_code import to whatever you called your RL agent/env/trainer script, and change MODEL_PATH = 'models/Xception__-118.00max_-179.10avg_-250.00min__1566603992.model' to the model you're actually using, since your model name will differ from mine. As an example, here's my agent in action: https://pythonprogramming.net/static/images/carla/current-car-after-100-episodes.mp4 I need to stress again that this is only 100 episodes in. Still, we can see the agent has learned to just do one thing. An agent may learn to do only one thing either because the Q values are effectively static (the model outputs the same Q values regardless of input), or, as in our case, they really do all vary, it's just that turning right always comes out highest. Another thing I noticed is that going straight is sometimes rated higher than turning left, sometimes for several steps in a row, so there is still hope. One thing I learned from the GTA V self-driving car series is that you can get away with adding a weighting on top of the output layer. For example, in the play script you could modify the qs like this:

    qs = model.predict(np.array(current_state).reshape(-1, *current_state.shape)/255)[0]
    qs *= [0.975, 1, 0.92]
    action = np.argmax(qs)

    This just acts like one final layer for the network. Again, it doesn't really rescue a 100-episode model. We have also since found that it's probably better to keep the rewards within -1 and +1, with no more -200: the -200 can blow up the Q values, which then seems to blow up the loss and make a mess. We could even clip things further. The next change we made was to simplify the neural network down to something like a 2-3 layer CNN with 64-256 features per layer. Nothing is certain yet, but it seems simpler is better here: there are fewer parameters to learn (a rough sketch of such a model follows below). For fully supervised learning, I think more parameters can work fine because everything is "ground truth." With reinforcement learning, I suspect it's hard for the AI to dig itself out of a dumb hole when it starts out trying to train millions of weights. Anyway, that's all for this tutorial. In the next one, I'll bring you a working model and show you how I got there.
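    For reference, here is a minimal sketch of the kind of simpler model described above (my own interpretation of "2-3 conv layers with 64-256 features each", not the exact architecture from the next tutorial), written as a possible drop-in replacement for create_model():

    from keras.layers import Conv2D, AveragePooling2D, Activation, Flatten, Dense
    from keras.models import Sequential
    from keras.optimizers import Adam

    def create_simple_model():
        # Small 3-layer CNN with 64 filters per layer instead of the Xception base
        model = Sequential()
        model.add(Conv2D(64, (3, 3), padding='same', input_shape=(IM_HEIGHT, IM_WIDTH, 3)))
        model.add(Activation('relu'))
        model.add(AveragePooling2D(pool_size=(5, 5), strides=(3, 3), padding='same'))

        model.add(Conv2D(64, (3, 3), padding='same'))
        model.add(Activation('relu'))
        model.add(AveragePooling2D(pool_size=(5, 5), strides=(3, 3), padding='same'))

        model.add(Conv2D(64, (3, 3), padding='same'))
        model.add(Activation('relu'))
        model.add(AveragePooling2D(pool_size=(5, 5), strides=(3, 3), padding='same'))

        model.add(Flatten())
        model.add(Dense(3, activation='linear'))   # one Q value per action
        model.compile(loss='mse', optimizer=Adam(lr=0.001), metrics=['accuracy'])
        return model

    On the reward side, the tweak described above would just mean returning -1 instead of -200 on collision in CarEnv.step(), keeping every reward inside the [-1, 1] range.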
