diff --git a/Rl.py b/Rl.py index 818af2ebcb3c88a4ece8f16dfb1f4ec40eb1a0a7..5d04b3fff3221ec06dfad4f2d26a473b864dc68b 100644 --- a/Rl.py +++ b/Rl.py @@ -99,115 +99,46 @@ def rewarding(appliable, config, action, missingLinks, funcname): forbiddenReward = 1.5 ################################################################################ -def rewardA(appliable, config, action, missingLinks): - if appliable: - if action.name != "BACK" : - reward = -1.0*action.getOracleScore(config, missingLinks) - else : - back = action.size - error_in_pop = [i for i in range(1,back) if config.historyPop[-i][3] < 0] - last_error = error_in_pop[-1] if len(error_in_pop) > 0 else 0 - reward = last_error - back - else: - reward = -forbiddenReward - return reward -################################################################################ - -################################################################################ -def rewardB(appliable, config, action, missingLinks): - if appliable: - if action.name != "BACK" : - reward = 1.0 - action.getOracleScore(config, missingLinks) - else : - back = action.size - error_in_pop = [i for i in range(1,back) if config.historyPop[-i][3] < 0] - last_error = error_in_pop[-1] if len(error_in_pop) > 0 else 0 - reward = last_error - back - else: - reward = -forbiddenReward - return reward -################################################################################ - -################################################################################ -def rewardC(appliable, config, action, missingLinks): +def rewardE(appliable, config, action, missingLinks): if appliable: if action.name != "BACK" : reward = -action.getOracleScore(config, missingLinks) else : - back = action.size - error_in_pop = [i for i in range(1,back) if config.historyPop[-i][3] < 0] - canceledRewards = [h[3] for h in config.historyPop[-back:]] - reward = -sum(canceledRewards) + reward = 0.5 else: reward = -forbiddenReward return reward ################################################################################ ################################################################################ -def reward3C(appliable, config, action, missingLinks): - if appliable: - if action.name != "BACK" : - reward = -action.getOracleScore(config, missingLinks) - else : - back = action.size - error_in_pop = [i for i in range(1,back) if config.historyPop[-i][3] < 0] - canceledRewards = [h[3] for h in config.historyPop[-back:]] - reward = -sum(canceledRewards) - else: - reward = -forbiddenReward - return reward*3.0 -################################################################################ - -################################################################################ -def rewardD(appliable, config, action, missingLinks): +def rewardG(appliable, config, action, missingLinks): if appliable: if action.name != "BACK" : reward = -action.getOracleScore(config, missingLinks) else : back = action.size - error_in_pop = [i for i in range(1,back) if config.historyPop[-i][3] < 0] canceledRewards = [h[3] for h in config.historyPop[-back:]] - reward = -sum(canceledRewards) - 1 - else: - reward = -forbiddenReward - return reward -################################################################################ - -################################################################################ -def rewardE(appliable, config, action, missingLinks): - if appliable: - if action.name != "BACK" : - reward = -action.getOracleScore(config, missingLinks) - else : - reward = -0.5 + reward = np.log(1-sum(canceledRewards)) if -sum(canceledRewards) > 0 else -1 else: reward = -forbiddenReward return reward ################################################################################ ################################################################################ -def rewardF(appliable, config, action, missingLinks): - if appliable: - if action.name != "BACK" : - reward = -1.0*action.getOracleScore(config, missingLinks) - else : - back = action.size - error_in_pop = [i for i in range(1,back) if config.historyPop[-i][3] < 0] - last_error = error_in_pop[-1] if len(error_in_pop) > 0 else 0 - reward = last_error - back - else: - reward = -forbiddenReward - return 10*reward -################################################################################ - -################################################################################ -def rewardG(appliable, config, action, missingLinks): +def rewardA(appliable, config, action, missingLinks): if appliable: if action.name != "BACK" : reward = -action.getOracleScore(config, missingLinks) else : - back = action.size - canceledRewards = [h[3] for h in config.historyPop[-back:]] + canceledRewards = [] + found = 0 + for i in range(len(config.historyPop))[::-1] : + if config.historyPop[i][0].name == "NOBACK" : + found += 1 + if found == action.size : + break + else : + canceledRewards.append(config.historyPop[i][3]) reward = np.log(1-sum(canceledRewards)) if -sum(canceledRewards) > 0 else -1 else: reward = -forbiddenReward @@ -215,7 +146,7 @@ def rewardG(appliable, config, action, missingLinks): ################################################################################ ################################################################################ -def rewardA(appliable, config, action, missingLinks): +def rewardA2(appliable, config, action, missingLinks): if appliable: if action.name != "BACK" : reward = -action.getOracleScore(config, missingLinks) @@ -229,7 +160,7 @@ def rewardA(appliable, config, action, missingLinks): break else : canceledRewards.append(config.historyPop[i][3]) - reward = np.log(1-sum(canceledRewards)) if -sum(canceledRewards) > 0 else -1 + reward = np.log(1-sum(canceledRewards)) if -sum(canceledRewards) > 0 else 0 else: reward = -forbiddenReward return reward