New rewards

1790aa5a · Franck Dary · fbd3d3cc · 1790aa5a
Commit 1790aa5a authored Jul 13, 2021 by Franck Dary
--- a/Rl.py
+++ b/Rl.py
@@ -99,115 +99,46 @@ def rewarding(appliable, config, action, missingLinks, funcname):
 forbiddenReward = 1.5

 ################################################################################
-def rewardA(appliable, config, action, missingLinks):
-  if appliable:
-    if action.name != "BACK" :
-      reward = -1.0*action.getOracleScore(config, missingLinks)
-    else :
-      back = action.size
-      error_in_pop = [i for i in range(1,back) if config.historyPop[-i][3] < 0]
-      last_error = error_in_pop[-1] if len(error_in_pop) > 0 else 0
-      reward = last_error - back
-  else:
-    reward = -forbiddenReward
-  return reward
-################################################################################
-
-################################################################################
-def rewardB(appliable, config, action, missingLinks):
-  if appliable:
-    if action.name != "BACK" :
-      reward = 1.0 - action.getOracleScore(config, missingLinks)
-    else :
-      back = action.size
-      error_in_pop = [i for i in range(1,back) if config.historyPop[-i][3] < 0]
-      last_error = error_in_pop[-1] if len(error_in_pop) > 0 else 0
-      reward = last_error - back
-  else:
-    reward = -forbiddenReward
-  return reward
-################################################################################
-
-################################################################################
-def rewardC(appliable, config, action, missingLinks):
+def rewardE(appliable, config, action, missingLinks):
  if appliable:
    if action.name != "BACK" :
      reward = -action.getOracleScore(config, missingLinks)
    else :
-      back = action.size
-      error_in_pop = [i for i in range(1,back) if config.historyPop[-i][3] < 0]
-      canceledRewards = [h[3] for h in config.historyPop[-back:]]
-      reward = -sum(canceledRewards)
+      reward = 0.5
  else:
    reward = -forbiddenReward
  return reward
 ################################################################################

 ################################################################################
-def reward3C(appliable, config, action, missingLinks):
-  if appliable:
-    if action.name != "BACK" :
-      reward = -action.getOracleScore(config, missingLinks)
-    else :
-      back = action.size
-      error_in_pop = [i for i in range(1,back) if config.historyPop[-i][3] < 0]
-      canceledRewards = [h[3] for h in config.historyPop[-back:]]
-      reward = -sum(canceledRewards)
-  else:
-    reward = -forbiddenReward
-  return reward*3.0
-################################################################################
-
-################################################################################
-def rewardD(appliable, config, action, missingLinks):
+def rewardG(appliable, config, action, missingLinks):
  if appliable:
    if action.name != "BACK" :
      reward = -action.getOracleScore(config, missingLinks)
    else :
      back = action.size
-      error_in_pop = [i for i in range(1,back) if config.historyPop[-i][3] < 0]
      canceledRewards = [h[3] for h in config.historyPop[-back:]]
-      reward = -sum(canceledRewards) - 1
+      reward = np.log(1-sum(canceledRewards)) if -sum(canceledRewards) > 0 else -1
  else:
    reward = -forbiddenReward
  return reward
 ################################################################################

 ################################################################################
-def rewardE(appliable, config, action, missingLinks):
+def rewardA(appliable, config, action, missingLinks):
  if appliable:
    if action.name != "BACK" :
      reward = -action.getOracleScore(config, missingLinks)
    else :
-      reward = -0.5
-  else:
-    reward = -forbiddenReward
-  return reward
-################################################################################
-
-################################################################################
-def rewardF(appliable, config, action, missingLinks):
-  if appliable:
-    if action.name != "BACK" :
-      reward = -1.0*action.getOracleScore(config, missingLinks)
-    else :
-      back = action.size
-      error_in_pop = [i for i in range(1,back) if config.historyPop[-i][3] < 0]
-      last_error = error_in_pop[-1] if len(error_in_pop) > 0 else 0
-      reward = last_error - back
-  else:
-    reward = -forbiddenReward
-  return 10*reward
-################################################################################
-
-################################################################################
-def rewardG(appliable, config, action, missingLinks):
-  if appliable:
-    if action.name != "BACK" :
-      reward = -action.getOracleScore(config, missingLinks)
+      canceledRewards = []
+      found = 0
+      for i in range(len(config.historyPop))[::-1] :
+        if config.historyPop[i][0].name == "NOBACK" :
+          found += 1
+          if found == action.size :
+            break
        else :
-      back = action.size
-      canceledRewards = [h[3] for h in config.historyPop[-back:]]
+          canceledRewards.append(config.historyPop[i][3])
      reward = np.log(1-sum(canceledRewards)) if -sum(canceledRewards) > 0 else -1
  else:
    reward = -forbiddenReward
@@ -215,7 +146,7 @@ def rewardG(appliable, config, action, missingLinks):
 ################################################################################

 ################################################################################
-def rewardA(appliable, config, action, missingLinks):
+def rewardA2(appliable, config, action, missingLinks):
  if appliable:
    if action.name != "BACK" :
      reward = -action.getOracleScore(config, missingLinks)
@@ -229,7 +160,7 @@ def rewardA(appliable, config, action, missingLinks):
            break
        else :
          canceledRewards.append(config.historyPop[i][3])
-      reward = np.log(1-sum(canceledRewards)) if -sum(canceledRewards) > 0 else -1
+      reward = np.log(1-sum(canceledRewards)) if -sum(canceledRewards) > 0 else 0
  else:
    reward = -forbiddenReward
  return reward