Added new learning method; still needs to be tested

lexeree · lexeree · commit d2e3ce11fb1f · 2022-08-30T21:15:27.000+02:00
diff --git a/normative_supervisor/supervisor/normsys/DDPLTranslator.java b/normative_supervisor/supervisor/normsys/DDPLTranslator.java
@@ -43,13 +43,24 @@ public void init(Environment env, ArrayList<String> actions) {
 	public void update(Environment env, ArrayList<String> possible, Game game) {
 		labelsToFacts(env);
 		generateActionNorms(possible);
-		generateRegulativeRules();
 		generateConstitutiveRules(env);
+		generateRegulativeRules();
 		generateDefeaters();
 		generateHierarchies();
 
 	}
 	
+	
+	public void synth_update(Environment env) { // same steps as update() except generateActionNorms(possible) is skipped
+		labelsToFacts(env);
+		generateConstitutiveRules(env); // constitutive rules are generated before regulative ones, matching the order in update()
+		generateRegulativeRules();
+		generateDefeaters();
+		generateHierarchies();
+	}
+	
+	
+	
 	public void generateActionNorms(ArrayList<String> actions) {
 			normBase.generateNonConcurrence(actions);
 			//normBase.generateRequired(actions);
@@ -104,7 +115,7 @@ public void labelsToFacts(Environment env) {
 		for(String lab : env.getNegLabels()) {
 			Literal lit = new Literal(lab);
 			Literal nlit = lit.getComplementClone();
-			Rule fact = new Rule(lab, RuleType.DEFEASIBLE);
+			Rule fact = new Rule(lab, RuleType.FACT);
 			try {
 				fact.addHeadLiteral(nlit);
 			} catch (RuleException e) {}
@@ -210,8 +221,17 @@ public void generateRegulativeRules() {
 				}
 				Literal lit = termToLit(n.getPrescription(), true);
 				rule.addHeadLiteral(lit);
-				rule.setMode(obl);
-				
+				if(n.getName().contains("default")) {
+					for(Rule r : rules) {
+						if(r.isConflictRule(rule)) {
+							Superiority sup = new Superiority(r.getLabel(),rule.getLabel());
+							hierarchy.add(sup);
+						}
+					}
+				}
+				else {
+				    rule.setMode(obl);
+				}
 				if(n.getName().contains("concur") || n.getName().contains("req")) {
 					rule.setRuleType(RuleType.DEFEASIBLE);
 					actionRules.add(rule.clone());
diff --git a/ns_lab.jar b/ns_lab.jar
diff --git a/ns_server.jar b/ns_server.jar
diff --git a/pacman/filter.py b/pacman/filter.py
@@ -9,7 +9,6 @@
 class NormativeFilter():
 
     def __init__(self, norms, reason, port=PORT):
-        self.process = None
         self.violations = []
         self.norm_base = norms
         self.reasoner = reason
@@ -55,8 +54,16 @@ def process_message(self, message):
             if self.compliant:
                 return 0
             else:
-                #score = message['score']
                 return -1
+        elif message['response'] == 'DUAL-EVALUATION':
+            subideal = message['sub-ideal']
+            if self.compliant:
+                return 0,0
+            else:
+                if subideal:
+                    return -1,0
+                else:
+                    return -1,-1
 
 
     def build_query(self, state, actions, rt):
@@ -68,6 +75,8 @@ def build_query(self, state, actions, rt):
             to['possible'] = actions
         elif rt == 'EVALUATION':
             to['action'] = actions[0]
+        elif rt == 'DUAL-EVALUATION':
+            to['action'] = actions[0]
         return to
 
 
diff --git a/pacman/game.py b/pacman/game.py
@@ -522,7 +522,7 @@ class Game:
     """
 
     def __init__( self, agents, display, rules, startingIndex=0, muteAgents=False, catchExceptions=False, filter=None,
-                  train=False, supervise=False , learn=False):
+                  train=False, supervise=False , learn1=False, learn2=False):
         self.agentCrashed = False
         self.agents = agents
         self.display = display
@@ -540,7 +540,8 @@ def __init__( self, agents, display, rules, startingIndex=0, muteAgents=False, c
         self.filter = filter
         self.train = train
         self.supervise = supervise
-        self.learn = learn
+        self.learn1 = learn1
+        self.learn2 = learn2
 
     def getProgress(self):
         if self.gameOver:
@@ -645,7 +646,7 @@ def run( self ):
                         self.unmute()
                         return
                 else:
-                    observation = agent.observationFunction(self.state.deepCopy(), self.filter, self.learn)
+                    observation = agent.observationFunction(self.state.deepCopy(), self.filter, self.learn1, self.learn2)
                 self.unmute()
             else:
                 observation = self.state.deepCopy()
diff --git a/pacman/layouts/mediumOpen.lay b/pacman/layouts/mediumOpen.lay
@@ -0,0 +1,7 @@
+%%%%%%%%%
+%G.....o%
+%.......%
+%.......%
+%.......%
+%G.....P%
+%%%%%%%%%
diff --git a/pacman/learningAgents.py b/pacman/learningAgents.py
@@ -121,7 +121,7 @@ def getLegalActions(self,state, filter=None, train=False, supervise=False):
         """
         return self.actionFn(state, filter, train, supervise)
 
-    def observeTransition(self, state,action,nextState,deltaReward, filter=None, learn=False):
+    def observeTransition(self, state,action,nextState,deltaReward, filter=None, learn1=False, learn2=False):
         """
             Called by environment to inform agent that a transition has
             been observed. This will result in a call to self.update
@@ -131,11 +131,13 @@ def observeTransition(self, state,action,nextState,deltaReward, filter=None, lea
         """
         self.episodeRewards += deltaReward
         self.update(state,action,nextState,deltaReward)
-        if learn and filter is not None:
+        if learn1 and filter is not None:
             result = filter.process_message(filter.send_request(filter.build_query(state.data, [action], 'EVALUATION')))
-            #if result != 0:
-            #    print(action+": "+str(result))
-            self.update2(state, action, nextState,result)
+            self.update2(state, action, nextState, result)
+        if learn2 and filter is not None:
+            result1, result2 = filter.process_message(filter.send_request(filter.build_query(state.data, [action], 'DUAL-EVALUATION')))
+            self.update2(state, action, nextState, result1)
+            self.update3(state, action, nextState, result2)
 
     def startEpisode(self):
         """
@@ -209,14 +211,14 @@ def doAction(self,state,action):
     ###################
     # Pacman Specific #
     ###################
-    def observationFunction(self, state, filter=None, learn=False):
+    def observationFunction(self, state, filter=None, learn1=False, learn2=False):
         """
             This is where we ended up after our last action.
             The simulation should somehow ensure this is called
         """
         if not self.lastState is None:
             reward = state.getScore() - self.lastState.getScore()
-            self.observeTransition(self.lastState, self.lastAction, state, reward, filter, learn)
+            self.observeTransition(self.lastState, self.lastAction, state, reward, filter, learn1, learn2)
         return state
 
     def registerInitialState(self, state):
diff --git a/pacman/pacman.py b/pacman/pacman.py
@@ -294,12 +294,12 @@ def __init__(self, timeout=30):
         self.timeout = timeout
 
     def newGame( self, layout, pacmanAgent, ghostAgents, display, quiet = False, catchExceptions=False, filter=None,
-                 train=False, supervise=False, learn=False):
+                 train=False, supervise=False, learn1=False, learn2=False):
         agents = [pacmanAgent] + ghostAgents[:layout.getNumGhosts()]
         initState = GameState()
         initState.initialize( layout, len(ghostAgents) )
         game = Game(agents, display, self, catchExceptions=catchExceptions, filter=filter, train=train,
-                    supervise=supervise, learn=learn)
+                    supervise=supervise, learn1=learn1, learn2=learn2)
         game.state = initState
         self.initialState = initState.deepCopy()
         self.quiet = quiet
@@ -572,9 +572,10 @@ def readCommand( argv ):
     parser.add_option('--rec', dest='rec', help=default('Would you like to save a record of tests run? Input file name.'),
                       default=None)
     parser.add_option('--supervise', action='store_true', dest='supervise', help='Use normative supervisor?', default=False)
-    parser.add_option('--learn', action='store_true', dest='learn', help='Learn with norms - only choose with MORL agent', default=False)
-    parser.add_option('--partial', action='store_true', dest='partial',
-                      help='Learn with a partial MDP', default=False)
+    parser.add_option('--learn', action='store_true', dest='learn1', help='Learn with norms - only choose with MORL agent', default=False)
+    parser.add_option('--sublearn', action='store_true', dest='learn2',
+                      help='Learn with sub ideal reward function; only select for SubIdealAgent', default=False)
+    parser.add_option('--partial', action='store_true', dest='partial', help='Learn with a partial MDP', default=False)
     #parser.add_option('--punish', type='int', dest='punish', help=default('Punishment for violation of norm base.'), default=0)
     parser.add_option('--port', type='int', dest='port', help=default('Port number.'), default=6666)
     #parser.add_option('--track', action='store_true', dest='track', default=False)
@@ -629,7 +630,8 @@ def readCommand( argv ):
     args['reason'] = options.reason
     args['rec'] = options.rec
     args['supervise'] = options.supervise
-    args['learn'] = options.learn
+    args['learn1'] = options.learn1
+    args['learn2'] = options.learn2
     args['partial'] = options.partial
     args['port'] = options.port
     #if options.track:
@@ -690,7 +692,7 @@ def replayGame( layout, actions, display ):
     display.finish()
 
 def runGames( layout, pacman, ghosts, display, numGames, record, numTraining = 0, catchExceptions=False, timeout=30,
-              norm=None, reason=None, rec=None, supervise=False, learn=False, partial=False, port=6666):
+              norm=None, reason=None, rec=None, supervise=False, learn1=False, learn2=False, partial=False, port=6666):
     import __main__
     __main__.__dict__['_display'] = display
 
@@ -724,7 +726,7 @@ def runGames( layout, pacman, ghosts, display, numGames, record, numTraining = 0
             rules.quiet = False
             train = False
             sup = supervise
-        game = rules.newGame( layout, pacman, ghosts, gameDisplay, beQuiet, catchExceptions, filt, train, sup, learn)
+        game = rules.newGame( layout, pacman, ghosts, gameDisplay, beQuiet, catchExceptions, filt, train, sup, learn1,learn2)
         game.run()
         if not beQuiet: games.append(game)
 
diff --git a/pacman/subIdealAgents.py b/pacman/subIdealAgents.py
@@ -0,0 +1,170 @@
+#EMERY: Sub-ideal agents: thresholded lexicographic multi-objective RL agents that keep three Q-functions
+
+
+from game import *
+from learningAgents import ReinforcementAgent
+from featureExtractors import *
+
+import random,util,math
+
+class SubIdealAgent(ReinforcementAgent):  # tabular learner with three Q-tables; getPolicy ranks actions by Q2, then Q3, then Q1
+    def __init__(self, **args):
+        ReinforcementAgent.__init__(self, **args)
+        self.QValues1 = util.Counter()  # written by update()
+        self.QValues2 = util.Counter()  # written by update2()
+        self.QValues3 = util.Counter()  # written by update3()
+        self.legalActions = []  # refreshed by getAction(); reused by getPolicy() and getValue1/2/3()
+
+    def getQValue1(self, state, action):  # raw table lookup, no cap
+        return self.QValues1[(state, action)]
+
+    def getQValue2(self, state, action):  # table lookup capped at 0, so all non-negative estimates tie
+        return min(0, self.QValues2[(state, action)])
+
+    def getQValue3(self, state, action):  # table lookup capped at 0, same as getQValue2
+        return min(0, self.QValues3[(state, action)])
+
+
+    def getAction(self, state, filter=None, train=False, supervise=False):  # returns None when there are no legal actions
+        # Pick Action
+        self.legalActions = self.getLegalActions(state, filter, train, supervise)  # cached for the value/policy helpers below
+        action = None
+        if not self.legalActions:
+          return action
+        randomize = util.flipCoin(self.epsilon)
+        if randomize:  # flip came up true: take a uniformly random legal action
+          action = random.choice(self.legalActions)
+        else:  # otherwise follow the lexicographic policy
+          action = self.getPolicy(state)
+        return action
+
+    def update(self, state, action, nextState, reward):  # alpha-weighted blend of the old Q1 entry and reward + discount * getValue1(nextState)
+        curQ = self.QValues1[(state, action)]
+        self.QValues1[(state, action)] = (1 - self.alpha) * curQ + self.alpha * (
+                    reward + self.discount * self.getValue1(nextState))
+
+    def update2(self, state, action, nextState, reward):  # same rule as update(), applied to QValues2 with getValue2
+        curQ = self.QValues2[(state, action)]  # raw entry; the 0 cap is only applied on reads via getQValue2
+        self.QValues2[(state, action)] = (1 - self.alpha) * curQ + self.alpha * (
+                    reward + self.discount * self.getValue2(nextState))
+
+    def update3(self, state, action, nextState, reward):  # same rule as update(), applied to QValues3 with getValue3
+        curQ = self.QValues3[(state, action)]  # raw entry; the 0 cap is only applied on reads via getQValue3
+        self.QValues3[(state, action)] = (1 - self.alpha) * curQ + self.alpha * (
+                    reward + self.discount * self.getValue3(nextState))
+
+    def getPolicy(self, state):  # lexicographic choice over self.legalActions; returns None if that list is empty
+        actions1 = []  # actions tied for the best capped Q2
+        actions2 = []  # subset of actions1 tied for the best capped Q3
+        if not self.legalActions:
+            return None
+        val2 = self.getValue2(state)
+        for action in self.legalActions:
+            if val2 == self.getQValue2(state, action):
+                actions1.append(action)
+        qvals1 = [self.getQValue3(state, act) for act in actions1]  # despite the name, these are Q3 values
+        val3 = max(qvals1)
+        for action in actions1:
+            if val3 == self.getQValue3(state, action):
+                actions2.append(action)
+        qvals2 = [self.getQValue1(state, act) for act in actions2]  # despite the name, these are Q1 values
+        val1 = max(qvals2)
+        actions3 = []  # final candidates: best Q1 among actions2
+        for a in actions2:
+            if val1 == self.getQValue1(state, a):
+                actions3.append(a)
+        return random.choice(actions3)  # remaining ties are broken uniformly at random
+
+    def getValue1(self, state, filter=None, train=False):  # max Q1 over self.legalActions, 0.0 if empty; filter/train are unused
+        qvals = [self.getQValue1(state, action) for action in self.legalActions]  # NOTE(review): uses the list cached by getAction, not actions queried for `state` -- confirm intended
+        if not qvals:
+            return 0.0
+        return max(qvals)
+
+    def getValue2(self, state, filter=None, train=False):  # max capped Q2 over self.legalActions, 0.0 if empty; filter/train are unused
+        qvals = [self.getQValue2(state, action) for action in self.legalActions]
+        if not qvals:
+            return 0.0
+        return max(qvals)
+
+    def getValue3(self, state, filter=None, train=False):  # max capped Q3 over self.legalActions, 0.0 if empty; filter/train are unused
+        qvals = [self.getQValue3(state, action) for action in self.legalActions]
+        if not qvals:
+            return 0.0
+        return max(qvals)
+
+class PacmanSubIdealAgent(SubIdealAgent):  # SubIdealAgent with fixed default hyperparameters and a doAction() call on every chosen action
+    def __init__(self, epsilon=0.05,gamma=0.8,alpha=0.2, numTraining=0, **args):
+        args['epsilon'] = epsilon  # the explicit keyword values overwrite any same-named entries in args
+        args['gamma'] = gamma  # presumably stored as self.discount by ReinforcementAgent -- TODO confirm
+        args['alpha'] = alpha
+        args['numTraining'] = numTraining
+        self.index = 0  # This is always Pacman
+        SubIdealAgent.__init__(self, **args)
+
+
+    def getAction(self, state, filter=None, train=False, supervise=False):  # same choice as SubIdealAgent.getAction, then reported via doAction
+        action = SubIdealAgent.getAction(self,state, filter, train, supervise)
+        self.doAction(state,action)  # called even when action is None (no legal actions)
+        return action
+
+
+class ApproximateSubIdealAgent(PacmanSubIdealAgent):  # replaces the three Q-tables with three weight vectors over extracted features
+    def __init__(self, extractor='IdentityExtractor', **args):
+        self.featExtractor = util.lookup(extractor, globals())()  # looks the extractor up by name via util.lookup and calls the result
+        PacmanSubIdealAgent.__init__(self, **args)
+        self.weights1 = util.Counter()  # weights behind getQValue1 / update
+        self.weights2 = util.Counter()  # weights behind getQValue2 / update2
+        self.weights3 = util.Counter()  # weights behind getQValue3 / update3
+
+    def getWeights(self):  # returns the tuple (weights1, weights2, weights3)
+        return self.weights1, self.weights2, self.weights3
+
+    def getQValue1(self, state, action):  # sum of feature value * weights1 entry over all extracted features
+        qval1 = 0.0
+        features = self.featExtractor.getFeatures(state, action)
+        for feature in features:
+          qval1 += features[feature] * self.weights1[feature]
+        return qval1
+
+    def getQValue2(self, state, action):  # same weighted sum with weights2, capped at -0.1
+        qval2 = 0.0
+        features = self.featExtractor.getFeatures(state, action)
+        for feature in features:
+          qval2 += features[feature] * self.weights2[feature]
+        return min(-0.1, qval2)  # NOTE(review): cap is -0.1 here but 0 in SubIdealAgent.getQValue2 -- confirm intended
+
+    def getQValue3(self, state, action):  # same weighted sum with weights3, capped at -0.1
+        qval3 = 0.0
+        features = self.featExtractor.getFeatures(state, action)
+        for feature in features:
+          qval3 += features[feature] * self.weights3[feature]
+        return min(-0.1, qval3)  # NOTE(review): cap is -0.1 here but 0 in SubIdealAgent.getQValue3 -- confirm intended
+
+    def update(self, state, action, nextState, reward):  # shifts each weights1 entry by alpha * difference * feature value
+        features = self.featExtractor.getFeatures(state, action)
+        difference = reward + self.discount * self.getValue1(nextState) - self.getQValue1(state, action)
+        for feature in features:
+          self.weights1[feature] += self.alpha * difference * features[feature]
+
+    def update2(self, state, action, nextState, reward):  # same rule on weights2; difference is computed from the capped Q2 values
+        features = self.featExtractor.getFeatures(state, action)
+        difference = reward + self.discount * self.getValue2(nextState) - self.getQValue2(state, action)
+        for feature in features:
+          self.weights2[feature] += self.alpha * difference * features[feature]
+
+    def update3(self, state, action, nextState, reward):  # same rule on weights3; difference is computed from the capped Q3 values
+        features = self.featExtractor.getFeatures(state, action)
+        difference = reward + self.discount * self.getValue3(nextState) - self.getQValue3(state, action)
+        for feature in features:
+          self.weights3[feature] += self.alpha * difference * features[feature]
+
+    def final(self, state):  # delegates to the parent; the training-finished branch is currently a no-op
+        # call the super-class final method
+        PacmanSubIdealAgent.final(self, state)
+
+        # did we finish training?
+        if self.episodesSoFar == self.numTraining:
+            # you might want to print your weights here for debugging
+            #print('weights', self.getWeights())  # this class has weights1/2/3, no self.weights
+            pass
diff --git a/run.sh b/run.sh