elasticsearch-hadoop is a project that deeply integrates Hadoop and Elasticsearch, maintained as an official Elasticsearch subproject. By implementing input and output between Hadoop and ES, it lets Hadoop jobs read from and write to an ES cluster, taking full advantage of MapReduce's parallelism and bringing real-time search to Hadoop data.
Project site: http://www.elasticsearch.org/overview/hadoop/
Environment:
CDH4, Elasticsearch 0.90.2
http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH4/latest/CDH4-Quick-Start/cdh4qs_topic_3_3.html
https://github.com/medcl/elasticsearch-rtf
Interoperating between Hive and ES:
# Setup: point Hive at the elasticsearch-hadoop JAR
# Download the hadoop-es jar: https://download.elasticsearch.org/hadoop/hadoop-latest.zip
# The JAR path Hive loads here is a local path
[medcl@node-1 ~]$ ls
elasticsearch-hadoop-1.3.0.M1.jar
[medcl@node-1 ~]$ pwd
/home/medcl
[medcl@node-1 ~]$ hive -hiveconf hive.aux.jars.path=/home/medcl/elasticsearch-hadoop-1.3.0.M1.jar
Logging initialized using configuration in file:/etc/hive/conf.dist/hive-log4j.properties
Hive history file=/tmp/medcl/hive_job_log_94db3616-e210-4aab-b07b-6fb159e217ec_1758848920.txt
# The Elasticsearch cluster is named "elasticsearch" and runs on the same machine as Hadoop
# In Hive, create a table (user) and map it to an ES index (/index/user) via elasticsearch-hadoop, with two fields: id and name
CREATE EXTERNAL TABLE user (id INT, name STRING)
STORED BY 'org.elasticsearch.hadoop.hive.ESStorageHandler'
TBLPROPERTIES('es.resource' = 'index/user/',
              'es.index.auto.create' = 'true');
# Working as the medcl user:
CREATE EXTERNAL TABLE user (id INT, name STRING)
STORED BY 'org.elasticsearch.hadoop.hive.ESStorageHandler'
TBLPROPERTIES('es.resource' = '/index/user/', 'es.index.auto.create' = 'true');
FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask
hive> CREATE EXTERNAL TABLE user (id INT, name STRING)
    > STORED BY 'org.elasticsearch.hadoop.hive.ESStorageHandler'
    > TBLPROPERTIES('es.resource' = 'medcl/',
    > 'es.index.auto.create' = 'false');
FAILED: Error in metadata: MetaException(message:Got exception: org.apache.hadoop.security.AccessControlException Permission denied: user=medcl, access=WRITE, inode="/user":hdfs:supergroup:drwxr-xr-x
# Ouch -- check the permissions
[medcl@node-1 ~]$ hadoop fs -lsr /
lsr: DEPRECATED: Please use 'ls -R' instead.
drwxrwxrwt   - hdfs  supergroup        0 2013-12-16 22:19 /tmp
drwxr-xr-x   - hdfs  supergroup        0 2013-12-16 22:25 /user
drwxr-xr-x   - medcl supergroup        0 2013-12-17 00:30 /user/medcl
drwxr-xr-x   - medcl supergroup        0 2013-12-16 22:32 /user/medcl/input
-rw-r--r--   1 medcl supergroup  2801897 2013-12-16 22:32 /user/medcl/input/file1.txt
drwxr-xr-x   - medcl supergroup        0 2013-12-17 00:30 /user/medcl/lib
-rw-r--r--   1 medcl supergroup   160414 2013-12-17 00:30 /user/medcl/lib/elasticsearch-hadoop-1.3.0.M1.jar
drwxr-xr-x   - hdfs  supergroup        0 2013-12-16 22:20 /var
drwxr-xr-x   - hdfs  supergroup        0 2013-12-16 22:20 /var/lib
# So /user is owned by hdfs. OK, switch to the hdfs user, and move the jar somewhere the hdfs user can reach -- /tmp will do
[root@node-1 medcl]# cp elasticsearch-hadoop-1.3.0.M1.jar /tmp/
[root@node-1 medcl]# ^C
[root@node-1 medcl]# sudo -u hdfs hive -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar
Logging initialized using configuration in file:/etc/hive/conf.dist/hive-log4j.properties
Hive history file=/tmp/hdfs/hive_job_log_bdad4d7a-f929-43d7-a56e-e026fdd7e3b4_1219802521.txt
hive> CREATE EXTERNAL TABLE user (id INT, name STRING)
    > STORED BY 'org.elasticsearch.hadoop.hive.ESStorageHandler'
    > TBLPROPERTIES('es.resource' = '/index/user/',
    > 'es.index.auto.create' = 'false');
2013-12-16 17:09:29.560 GMT Thread[main,5,main] java.io.FileNotFoundException: derby.log (Permission denied)
----------------------------------------------------------------
2013-12-16 17:09:29.877 GMT: Booting Derby version The Apache Software Foundation - Apache Derby - 10.4.2.0 - (689064): instance a816c00e-0142-fc62-4b5c-000000cec758
on database directory /var/lib/hive/metastore/metastore_db in READ ONLY mode
Database Class Loader started - derby.database.classpath=''
FAILED: Error in metadata: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.metastore.HiveMetaStoreClient
FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask
# OK, remove the Derby locks
[root@node-1 ~]# ls /var/lib/hive/metastore/metastore_db
dbex.lck  db.lck  log  seg0  service.properties  tmp
[root@node-1 ~]# rm /var/lib/hive/metastore/metastore_db/dbex.lck
rm: remove regular file `/var/lib/hive/metastore/metastore_db/dbex.lck'? y
[root@node-1 ~]# rm /var/lib/hive/metastore/metastore_db/db.lck
rm: remove regular file `/var/lib/hive/metastore/metastore_db/db.lck'? y
# Also forgot to close the other hive instance -- no wonder.
[root@node-1 tmp]# ps -aux|grep hive
Warning: bad syntax, perhaps a bogus '-'?
See /usr/share/doc/procps-3.2.8/FAQ
root     10855  0.0  0.1 148024  2064 pts/0 S+ 01:09 0:00 sudo -u hdfs hive -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar
hdfs     10856  1.8  5.7 858344 109892 pts/0 Sl+ 01:09 0:06 /usr/lib/jvm/java-openjdk/bin/java -Xmx256m -Dhadoop.log.dir=/usr/lib/hadoop/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/usr/lib/hadoop -Dhadoop.id.str= -Dhadoop.root.logger=INFO,console -Djava.library.path=/usr/lib/hadoop/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Dhadoop.security.logger=INFO,NullAppender org.apache.hadoop.util.RunJar /usr/lib/hive/lib/hive-cli-0.10.0-cdh4.5.0.jar org.apache.hadoop.hive.cli.CliDriver -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar
# A permissions problem
[root@node-1 tmp]# ll /var/lib/hive/metastore/metastore_db/
total 16
drwxrwxr-x 2 medcl medcl 4096 Dec 17 00:56 log
drwxrwxr-x 2 medcl medcl 4096 Dec 17 00:56 seg0
-rw-rw-r-- 1 medcl medcl  860 Dec 17 00:56 service.properties
drwxrwxr-x 2 medcl medcl 4096 Dec 17 01:01 tmp
[root@node-1 tmp]# sudo -u hdfs hive -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar^C
[root@node-1 tmp]# chmod 777 /var/lib/hive/metastore/metastore_db/ -R
[root@node-1 tmp]# sudo -u hdfs hive -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar
Logging initialized using configuration in file:/etc/hive/conf.dist/hive-log4j.properties
Hive history file=/tmp/hdfs/hive_job_log_d5749cb0-fde0-4da2-9094-c85cf4673885_252074310.txt
hive> show tables;
OK
Time taken: 6.934 seconds
hive> CREATE EXTERNAL TABLE user (id INT, name STRING)
    > STORED BY 'org.elasticsearch.hadoop.hive.ESStorageHandler'
    > TBLPROPERTIES('es.resource' = '/index/user/',
    > 'es.index.auto.create' = 'true');
OK
Time taken: 1.115 seconds
# OK, created successfully
hive> show tables;
OK
user
Time taken: 0.15 seconds
hive>
# The permission problem came from Hive's default warehouse path -- I'm rusty.
[root@node-1 tmp]# sudo su hdfs
bash-4.1$ hadoop fs -lsr /
lsr: DEPRECATED: Please use 'ls -R' instead.
drwxrwxrwt   - hdfs supergroup 0 2013-12-16 22:19 /tmp
drwxr-xr-x   - hdfs supergroup 0 2013-12-17 01:20 /user
drwxr-xr-x   - hdfs supergroup 0 2013-12-17 01:20 /user/hive
drwxr-xr-x   - hdfs supergroup 0 2013-12-17 01:20 /user/hive/warehouse
drwxr-xr-x   - hdfs supergroup 0 2013-12-17 01:20 /user/hive/warehouse/user
# Good. Now pour some data into Hive -- a few rows first
[root@node-1 tmp]# cat files1.txt
1,medcl
2,lcdem
3,tom
4,jack
# Upload it
[root@node-1 tmp]# sudo su hdfs
bash-4.1$ hadoop fs -put files1.txt /tmp/
bash-4.1$ hadoop fs -ls /tmp/
Found 1 items
-rw-r--r-- 1 hdfs supergroup 29 2013-12-17 01:28 /tmp/files1.txt
# Load it into Hive
hive -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar
# LOAD DATA LOCAL INPATH '/tmp/files1.txt' OVERWRITE INTO TABLE user_source;
# CREATE EXTERNAL TABLE user_source (id INT, name STRING);
# The ES-backed table is not a native Hive table, so it cannot be LOADed into directly
bash-4.1$ hive -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar
Logging initialized using configuration in file:/etc/hive/conf.dist/hive-log4j.properties
Hive history file=/tmp/hdfs/hive_job_log_a9516f87-6e2d-44db-9d38-18eed77d9dec_1583221137.txt
hive> LOAD DATA LOCAL INPATH '/tmp/files1.txt' OVERWRITE INTO TABLE user;
FAILED: SemanticException [Error 10101]: A non-native table cannot be used as target for LOAD
hive> CREATE EXTERNAL TABLE user_source (id INT, name STRING);
OK
Time taken: 1.104 seconds
hive> LOAD DATA LOCAL INPATH '/tmp/files1.txt' OVERWRITE INTO TABLE user_source;
Copying data from file:/tmp/files1.txt
Copying file: file:/tmp/files1.txt
Loading data to table default.user_source
Table default.user_source stats: [num_partitions: 0, num_files: 1, num_rows: 0, total_size: 29, raw_data_size: 0]
OK
Time taken: 0.911 seconds
hive> show tables;
OK
user
user_source
Time taken: 0.226 seconds
# The next error happens because the es-hadoop jar was never uploaded to HDFS -- apparently it must exist both locally and on HDFS, at the same path
hive> select id,name from user_source;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
java.io.FileNotFoundException: File does not exist: /tmp/elasticsearch-hadoop-1.3.0.M1.jar
	at org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:824)
	at org.apache.hadoop.filecache.DistributedCache.getFileStatus(DistributedCache.java:185)
	at org.apache.hadoop.filecache.TrackerDistributedCacheManager.determineTimestamps(TrackerDistributedCacheManager.java:821)
	at org.apache.hadoop.filecache.TrackerDistributedCacheManager.determineTimestampsAndCacheVisibilities(TrackerDistributedCacheManager.java:778)
	at org.apache.hadoop.mapred.JobClient.copyAndConfigureFiles(JobClient.java:855)
	at org.apache.hadoop.mapred.JobClient.copyAndConfigureFiles(JobClient.java:746)
	at org.apache.hadoop.mapred.JobClient.access$400(JobClient.java:177)
	at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:963)
	at org.apache.hadoop.mapred.JobClient$2.run(JobClient.java:948)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:415)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1408)
	at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:948)
	at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:922)
	at org.apache.hadoop.hive.ql.exec.ExecDriver.execute(ExecDriver.java:448)
	at org.apache.hadoop.hive.ql.exec.MapRedTask.execute(MapRedTask.java:138)
	at org.apache.hadoop.hive.ql.exec.Task.executeTask(Task.java:138)
	at org.apache.hadoop.hive.ql.exec.TaskRunner.runSequential(TaskRunner.java:66)
	at org.apache.hadoop.hive.ql.Driver.launchTask(Driver.java:1383)
	at org.apache.hadoop.hive.ql.Driver.execute(Driver.java:1169)
	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:982)
	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:902)
	at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:259)
	at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:216)
	at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:412)
	at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:759)
	at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:613)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at org.apache.hadoop.util.RunJar.main(RunJar.java:208)
Job Submission failed with exception 'java.io.FileNotFoundException(File does not exist: /tmp/elasticsearch-hadoop-1.3.0.M1.jar)'
FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.MapRedTask
# OK, put the jar on HDFS and try again
bash-4.1$ hadoop fs -put elasticsearch-hadoop-1.3.0.M1.jar /tmp/
bash-4.1$ hive -hiveconf hive.aux.jars.path=/tmp/elasticsearch-hadoop-1.3.0.M1.jar
Logging initialized using configuration in file:/etc/hive/conf.dist/hive-log4j.properties
Hive history file=/tmp/hdfs/hive_job_log_28ea1fbc-dc3b-4e62-9f47-1a88eed30069_1310993479.txt
hive> select id,name from user_source;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_201312162220_0004, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201312162220_0004
Kill Command = /usr/lib/hadoop/bin/hadoop job -kill job_201312162220_0004
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2013-12-17 01:36:28,086 Stage-1 map = 0%, reduce = 0%
2013-12-17 01:36:34,141 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 0.88 sec
2013-12-17 01:36:35,162 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 0.88 sec
2013-12-17 01:36:36,177 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 0.88 sec
2013-12-17 01:36:37,184 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 0.88 sec
2013-12-17 01:36:38,204 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 0.88 sec
MapReduce Total cumulative CPU time: 880 msec
Ended Job = job_201312162220_0004
MapReduce Jobs Launched:
Job 0: Map: 1   Cumulative CPU: 0.88 sec   HDFS Read: 247 HDFS Write: 24 SUCCESS
Total MapReduce CPU Time Spent: 880 msec
OK
NULL	NULL
NULL	NULL
NULL	NULL
NULL	NULL
Time taken: 25.999 seconds
# Wait -- why is the data all NULL? I created it as an EXTERNAL table and never set a field delimiter; frustrating.
hive> drop table user_source;
OK
Time taken: 0.649 seconds
hive> CREATE TABLE user_source (id INT, name STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
OK
Time taken: 0.109 seconds
hive> LOAD DATA LOCAL INPATH '/tmp/files1.txt' INTO TABLE user_source;
Copying data from file:/tmp/files1.txt
Copying file: file:/tmp/files1.txt
Loading data to table default.user_source
Table default.user_source stats: [num_partitions: 0, num_files: 1, num_rows: 0, total_size: 29, raw_data_size: 0]
OK
Time taken: 0.348 seconds
hive> select * from user_source;
OK
1	medcl
2	lcdem
3	tom
4	jack
Time taken: 0.155 seconds
# The source table is populated; now load it into the ES-backed table
hive> INSERT OVERWRITE TABLE user
    > SELECT s.id, s.name FROM user_source s;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
Starting Job = job_201312162220_0005, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201312162220_0005
Kill Command = /usr/lib/hadoop/bin/hadoop job -kill job_201312162220_0005
Hadoop job information for Stage-0: number of mappers: 1; number of reducers: 0
2013-12-17 01:50:52,141 Stage-0 map = 0%, reduce = 0%
2013-12-17 01:51:03,220 Stage-0 map = 100%, reduce = 0%, Cumulative CPU 1.16 sec
2013-12-17 01:51:04,243 Stage-0 map = 100%, reduce = 0%, Cumulative CPU 1.16 sec
2013-12-17 01:51:05,254 Stage-0 map = 100%, reduce = 0%, Cumulative CPU 1.16 sec
2013-12-17 01:51:06,266 Stage-0 map = 100%, reduce = 0%, Cumulative CPU 1.16 sec
2013-12-17 01:51:07,294 Stage-0 map = 100%, reduce = 100%, Cumulative CPU 1.16 sec
MapReduce Total cumulative CPU time: 1 seconds 160 msec
Ended Job = job_201312162220_0005
4 Rows loaded to user
MapReduce Jobs Launched:
Job 0: Map: 1   Cumulative CPU: 1.16 sec   HDFS Read: 247 HDFS Write: 0 SUCCESS
Total MapReduce CPU Time Spent: 1 seconds 160 msec
OK
Time taken: 21.849 seconds
hive> select * from user;
OK
Failed with exception java.io.IOException:java.lang.IllegalStateException: [GET] on [/index/user/&search_type=scan&scroll=10m&size=50&preference=_shards:4;_only_node:MP7Zl3owTRm8O2V6cWvOSg] failed; server[http://10.0.2.15:9200] returned [{"_index":"index","_type":"user","_id":"&search_type=scan&scroll=10m&size=50&preference=_shards:4;_only_node:MP7Zl3owTRm8O2V6cWvOSg","exists":false}]
Time taken: 0.387 seconds
# The query elasticsearch-hadoop generates looks broken! Still, the data made it into elasticsearch, and we don't need Hive-side queries for the moment, so I'll file an issue upstream.
# ES query results (note: the unquoted '&' in the URL sends curl to the background -- hence the "[1] 13588"; quote the URL to avoid this)
bash-4.1$ curl localhost:9200/index/user/_search?q=*&pretty=true
[1] 13588
bash-4.1$ {"took":3,"timed_out":false,"_shards":{"total":5,"successful":5,"failed":0},"hits":{"total":4,"max_score":1.0,"hits":[{"_index":"index","_type":"user","_id":"3x4bEcriRvS6AHkX2Sb7UA","_score":1.0, "_source" : {"id":2,"name":"lcdem"}},{"_index":"index","_type":"user","_id":"_3rGVWhaTSCixYxRzBUSLQ","_score":1.0, "_source" : {"id":4,"name":"jack"}},{"_index":"index","_type":"user","_id":"T-Q_icjgR8ehsH3IV-twWw","_score":1.0, "_source" : {"id":1,"name":"medcl"}},{"_index":"index","_type":"user","_id":"Vdz0sryBT5u0e9hfoMY8Tg","_score":1.0, "_source" : {"id":3,"name":"tom"}}]}}
# Next: test the performance of a large bulk import, and see whether data locality really holds.
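A minimal sketch of how such a bulk test could start (the table names match the walkthrough above; the 100k row count, the awk recipe, and the /tmp/bulk_users.txt path are my own choices, not from the original session):

```shell
# Generate 100,000 rows in the same "id,name" CSV format as files1.txt
seq 1 100000 | awk '{print $1",user"$1}' > /tmp/bulk_users.txt

# Then, in a hive session started with the same hive.aux.jars.path as above:
#   LOAD DATA LOCAL INPATH '/tmp/bulk_users.txt' INTO TABLE user_source;
#   INSERT OVERWRITE TABLE user SELECT s.id, s.name FROM user_source s;
```

Timing that INSERT (and watching which nodes the mappers land on) would give a first read on both throughput and locality.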
elasticsearch-hadoop download: https://github.com/elastic/elasticsearch-hadoop