Q-Learning分析

146 阅读 0 评论 97 点赞

我是靠谱客的博主光亮未来，这篇文章主要介绍Q-Learning分析，现在分享给大家，希望可以做个参考。

Q-Learning算法详解：

参考博文：1、http://blog.csdn.net/pi9nc/article/details/27649323

2、http://mnemstudio.org/path-finding-q-learning-example-1.htm

JAVA代码

复制代码

import java.util.Random;
/**
 * Tabular Q-learning demo on a fixed graph of six states (0..5) whose goal is state 5.
 *
 * <p>An action is "move to state a", so states and actions share the same index range.
 * {@link #main(String[])} trains the Q matrix with random walks, prints it, and then
 * prints the greedy route from every start state to the goal.
 *
 * <p>All state is static and unsynchronized; the class is meant to be run once from
 * {@code main} on a single thread.
 */
public class QLearning1
{
    /** Number of states (and therefore of actions). */
    private static final int Q_SIZE = 6;
    /** Index of the goal state; reaching it ends the random walk of an episode. */
    private static final int GOAL_STATE = 5;
    /** Discount factor applied to the best Q value of the state being entered. */
    private static final double GAMMA = 0.8;
    /** Number of training sweeps over {@link #INITIAL_STATES}. */
    private static final int ITERATIONS = 10;
    /** Start states, visited in this order by both training and testing. */
    private static final int[] INITIAL_STATES = {1, 3, 5, 2, 4, 0};
    /** Reward matrix {@code R[state][action]}; -1 means the two states are not connected. */
    private static final int[][] R = {{-1, -1, -1, -1, 0, -1},
                                      {-1, -1, -1, 0, -1, 100},
                                      {-1, -1, -1, 0, -1, -1},
                                      {-1, 0, 0, -1, 0, -1},
                                      {0, -1, -1, 0, -1, 100},
                                      {-1, 0, -1, -1, 0, 100}};
    /** Single generator shared by all draws instead of constructing one per draw. */
    private static final Random RANDOM = new Random();
    /** Learned knowledge: {@code q[state][action]}, truncated to whole numbers. */
    private static final int[][] q = new int[Q_SIZE][Q_SIZE];
    /** Index of the state the agent currently occupies. */
    private static int currentState = 0;

    /**
     * Resets the Q matrix, runs {@code ITERATIONS} sweeps of one episode per start state,
     * and prints the resulting Q matrix, one tab-separated row per line.
     */
    private static void train()
    {
        initialize();
        for(int sweep = 0; sweep < ITERATIONS; sweep++)
        {
            for(int i = 0; i < INITIAL_STATES.length; i++)
            {
                episode(INITIAL_STATES[i]);
            }
        }
        System.out.println("Q Matrix values:");
        for(int row = 0; row < Q_SIZE; row++)
        {
            for(int col = 0; col < Q_SIZE; col++)
            {
                System.out.print(q[row][col] + ",\t");
            }
            System.out.print("\n");
        }
        System.out.print("\n");
    }

    /**
     * Prints, for every start state, the route obtained by repeatedly moving to the
     * action with the highest Q value until the goal state is reached.
     *
     * <p>The walk is limited to {@code Q_SIZE} moves so that an insufficiently trained
     * Q matrix (whose greedy choices may cycle) cannot hang the program; such a route is
     * reported as not reaching the goal.
     */
    private static void test()
    {
        System.out.println("Shortest routes from initial states:");
        for(int i = 0; i < INITIAL_STATES.length; i++)
        {
            currentState = INITIAL_STATES[i];
            int moves = 0;
            // do-while: the start state is always printed and at least one move is made,
            // even when the start state is the goal itself.
            do
            {
                System.out.print(currentState + ", ");
                currentState = maximum(currentState, true);
                moves++;
            }while(currentState != GOAL_STATE && moves < Q_SIZE);
            if(currentState == GOAL_STATE){
                System.out.print(GOAL_STATE + "\n");
            }else{
                System.out.print("(goal not reached)\n");
            }
        }
    }

    /**
     * Runs one training session: a random walk from {@code initialState} that continues
     * until the goal state is entered, followed by {@code Q_SIZE} extra random steps to
     * help the values around the goal converge.
     *
     * @param initialState state index at which the walk starts
     */
    private static void episode(final int initialState)
    {
        currentState = initialState;
        // Keep moving while the goal has NOT been reached.
        do
        {
            chooseAnAction();
        }while(currentState != GOAL_STATE);
        // Starting from the goal, run through the set once more for convergence.
        for(int i = 0; i < Q_SIZE; i++)
        {
            chooseAnAction();
        }
    }

    /**
     * Takes one random valid step from the current state: updates the Q entry for the
     * chosen transition and then moves to the chosen state.
     */
    private static void chooseAnAction()
    {
        // getRandomAction only returns actions whose reward is not -1, so no re-check is needed.
        final int action = getRandomAction(Q_SIZE);
        q[currentState][action] = reward(action);
        currentState = action;
    }

    /**
     * Draws random actions until one is connected to the current state.
     *
     * @param upperBound exclusive upper bound for the drawn action index
     * @return an action {@code a} with {@code R[currentState][a] > -1}
     */
    private static int getRandomAction(final int upperBound)
    {
        int action;
        do
        {
            action = RANDOM.nextInt(upperBound);
        }while(R[currentState][action] <= -1);
        return action;
    }

    /** Sets every entry of the Q matrix to zero. */
    private static void initialize()
    {
        for(int row = 0; row < Q_SIZE; row++)
        {
            for(int col = 0; col < Q_SIZE; col++)
            {
                q[row][col] = 0;
            }
        }
    }

    /**
     * Finds the largest Q value in the row of the given state.
     *
     * @param state           row of the Q matrix to scan
     * @param returnIndexOnly {@code true} to get the action index (used by testing),
     *                        {@code false} to get the Q value itself (used by training)
     * @return the first index holding the maximum, or the maximum value
     */
    private static int maximum(final int state, final boolean returnIndexOnly)
    {
        int winner = 0;
        // Strict '>' keeps the lowest index on ties, so one pass is sufficient.
        for(int i = 1; i < Q_SIZE; i++)
        {
            if(q[state][i] > q[state][winner]){
                winner = i;
            }
        }
        return returnIndexOnly ? winner : q[state][winner];
    }

    /**
     * Computes the new Q value for moving from the current state to {@code action}:
     * the immediate reward plus {@code GAMMA} times the best Q value of the entered
     * state, truncated to an {@code int}.
     *
     * @param action state index being moved to
     * @return the updated Q value for {@code q[currentState][action]}
     */
    private static int reward(final int action)
    {
        return (int)(R[currentState][action] + (GAMMA * maximum(action, false)));
    }

    /**
     * Trains the Q matrix and then prints the greedy routes.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        train();
        test();
    }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import java.util.Random;
/**
 * Tabular Q-learning demo on a fixed graph of six states (0..5) whose goal is state 5.
 *
 * <p>An action is "move to state a", so states and actions share the same index range.
 * {@link #main(String[])} trains the Q matrix with random walks, prints it, and then
 * prints the greedy route from every start state to the goal.
 *
 * <p>All state is static and unsynchronized; the class is meant to be run once from
 * {@code main} on a single thread.
 */
public class QLearning1
{
    /** Number of states (and therefore of actions). */
    private static final int Q_SIZE = 6;
    /** Index of the goal state; reaching it ends the random walk of an episode. */
    private static final int GOAL_STATE = 5;
    /** Discount factor applied to the best Q value of the state being entered. */
    private static final double GAMMA = 0.8;
    /** Number of training sweeps over {@link #INITIAL_STATES}. */
    private static final int ITERATIONS = 10;
    /** Start states, visited in this order by both training and testing. */
    private static final int[] INITIAL_STATES = {1, 3, 5, 2, 4, 0};
    /** Reward matrix {@code R[state][action]}; -1 means the two states are not connected. */
    private static final int[][] R = {{-1, -1, -1, -1, 0, -1},
                                      {-1, -1, -1, 0, -1, 100},
                                      {-1, -1, -1, 0, -1, -1},
                                      {-1, 0, 0, -1, 0, -1},
                                      {0, -1, -1, 0, -1, 100},
                                      {-1, 0, -1, -1, 0, 100}};
    /** Single generator shared by all draws instead of constructing one per draw. */
    private static final Random RANDOM = new Random();
    /** Learned knowledge: {@code q[state][action]}, truncated to whole numbers. */
    private static final int[][] q = new int[Q_SIZE][Q_SIZE];
    /** Index of the state the agent currently occupies. */
    private static int currentState = 0;

    /**
     * Resets the Q matrix, runs {@code ITERATIONS} sweeps of one episode per start state,
     * and prints the resulting Q matrix, one tab-separated row per line.
     */
    private static void train()
    {
        initialize();
        for(int sweep = 0; sweep < ITERATIONS; sweep++)
        {
            for(int i = 0; i < INITIAL_STATES.length; i++)
            {
                episode(INITIAL_STATES[i]);
            }
        }
        System.out.println("Q Matrix values:");
        for(int row = 0; row < Q_SIZE; row++)
        {
            for(int col = 0; col < Q_SIZE; col++)
            {
                System.out.print(q[row][col] + ",\t");
            }
            System.out.print("\n");
        }
        System.out.print("\n");
    }

    /**
     * Prints, for every start state, the route obtained by repeatedly moving to the
     * action with the highest Q value until the goal state is reached.
     *
     * <p>The walk is limited to {@code Q_SIZE} moves so that an insufficiently trained
     * Q matrix (whose greedy choices may cycle) cannot hang the program; such a route is
     * reported as not reaching the goal.
     */
    private static void test()
    {
        System.out.println("Shortest routes from initial states:");
        for(int i = 0; i < INITIAL_STATES.length; i++)
        {
            currentState = INITIAL_STATES[i];
            int moves = 0;
            // do-while: the start state is always printed and at least one move is made,
            // even when the start state is the goal itself.
            do
            {
                System.out.print(currentState + ", ");
                currentState = maximum(currentState, true);
                moves++;
            }while(currentState != GOAL_STATE && moves < Q_SIZE);
            if(currentState == GOAL_STATE){
                System.out.print(GOAL_STATE + "\n");
            }else{
                System.out.print("(goal not reached)\n");
            }
        }
    }

    /**
     * Runs one training session: a random walk from {@code initialState} that continues
     * until the goal state is entered, followed by {@code Q_SIZE} extra random steps to
     * help the values around the goal converge.
     *
     * @param initialState state index at which the walk starts
     */
    private static void episode(final int initialState)
    {
        currentState = initialState;
        // Keep moving while the goal has NOT been reached.
        do
        {
            chooseAnAction();
        }while(currentState != GOAL_STATE);
        // Starting from the goal, run through the set once more for convergence.
        for(int i = 0; i < Q_SIZE; i++)
        {
            chooseAnAction();
        }
    }

    /**
     * Takes one random valid step from the current state: updates the Q entry for the
     * chosen transition and then moves to the chosen state.
     */
    private static void chooseAnAction()
    {
        // getRandomAction only returns actions whose reward is not -1, so no re-check is needed.
        final int action = getRandomAction(Q_SIZE);
        q[currentState][action] = reward(action);
        currentState = action;
    }

    /**
     * Draws random actions until one is connected to the current state.
     *
     * @param upperBound exclusive upper bound for the drawn action index
     * @return an action {@code a} with {@code R[currentState][a] > -1}
     */
    private static int getRandomAction(final int upperBound)
    {
        int action;
        do
        {
            action = RANDOM.nextInt(upperBound);
        }while(R[currentState][action] <= -1);
        return action;
    }

    /** Sets every entry of the Q matrix to zero. */
    private static void initialize()
    {
        for(int row = 0; row < Q_SIZE; row++)
        {
            for(int col = 0; col < Q_SIZE; col++)
            {
                q[row][col] = 0;
            }
        }
    }

    /**
     * Finds the largest Q value in the row of the given state.
     *
     * @param state           row of the Q matrix to scan
     * @param returnIndexOnly {@code true} to get the action index (used by testing),
     *                        {@code false} to get the Q value itself (used by training)
     * @return the first index holding the maximum, or the maximum value
     */
    private static int maximum(final int state, final boolean returnIndexOnly)
    {
        int winner = 0;
        // Strict '>' keeps the lowest index on ties, so one pass is sufficient.
        for(int i = 1; i < Q_SIZE; i++)
        {
            if(q[state][i] > q[state][winner]){
                winner = i;
            }
        }
        return returnIndexOnly ? winner : q[state][winner];
    }

    /**
     * Computes the new Q value for moving from the current state to {@code action}:
     * the immediate reward plus {@code GAMMA} times the best Q value of the entered
     * state, truncated to an {@code int}.
     *
     * @param action state index being moved to
     * @return the updated Q value for {@code q[currentState][action]}
     */
    private static int reward(final int action)
    {
        return (int)(R[currentState][action] + (GAMMA * maximum(action, false)));
    }

    /**
     * Trains the Q matrix and then prints the greedy routes.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        train();
        test();
    }
}