org.apache.hadoop.hbase.ipc.ExecRPCInvoker$1@58e395e8,java.io.IOException: java.io.IOException: java.lang.NullPointerException
at org.apache.hadoop.hbase.security.token.AuthenticationTokenSecretManager.createPassword(AuthenticationTokenSecretManager.java:129)
at org.apache.hadoop.hbase.security.token.AuthenticationTokenSecretManager.createPassword(AuthenticationTokenSecretManager.java:57)
at org.apache.hadoop.security.token.Token.<init>(Token.java:70)
at org.apache.hadoop.hbase.security.token.AuthenticationTokenSecretManager.generateToken(AuthenticationTokenSecretManager.java:162)
at org.apache.hadoop.hbase.security.token.TokenProvider.getAuthenticationToken(TokenProvider.java:91)
at sun.reflect.GeneratedMethodAccessor56.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
at java.lang.reflect.Method.invoke(Method.java:597)
at org.apache.hadoop.hbase.regionserver.HRegion.exec(HRegion.java:5610)
at org.apache.hadoop.hbase.regionserver.HRegionServer.execCoprocessor(HRegionServer.java:3918)
at sun.reflect.GeneratedMethodAccessor39.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
at java.lang.reflect.Method.invoke(Method.java:597)
at org.apache.hadoop.hbase.ipc.SecureRpcEngine$Server.call(SecureRpcEngine.java:311)
at org.apache.hadoop.hbase.ipc.HBaseServer$Handler.run(HBaseServer.java:1428)
publicvoidrun() {
zkLeader.start();
zkLeader.waitToBecomeLeader(); //没有成为leader的人会一直阻塞在这里,直到感知到当前leader挂掉才会开始新一轮竞争
isMaster = true;
while (!stopped) {
long now = EnvironmentEdgeManager.currentTimeMillis();
// clear any expired
removeExpiredKeys(); //清除过期的token,同时也把它从ZK上移除if (lastKeyUpdate + keyUpdateInterval < now) { //默认的周期是1天// roll a new master key
rollCurrentKey(); //就是这个函数产生新的token,替换currenKey
}
try {
Thread.sleep(5000);
} catch (InterruptedException ie) {
if (LOG.isDebugEnabled()) {
LOG.debug("Interrupted waiting for next update", ie);
}
}
}
}
AuthenticationTokenSecretManager:
synchronizedvoid rollCurrentKey() {
if (!leaderElector.isMaster()) {
LOG.info("Skipping rollCurrentKey() because not running as master.");
return;
}
long now = EnvironmentEdgeManager.currentTimeMillis();
AuthenticationKey prev = currentKey;
AuthenticationKey newKey = new AuthenticationKey(++idSeq,
Long.MAX_VALUE, // don't allow to expire until it's replaced by a new key
generateSecret());
allKeys.put(newKey.getKeyId(), newKey);
currentKey = newKey; //滚动currentKey,置为newKey
zkWatcher.addKeyToZK(newKey); //把新的token放到zookeeper
lastKeyUpdate = now;
if (prev != null) {
// make sure previous key is still stored
prev.setExpiration(now + tokenMaxLifetime); //prev是原来的newKey,是不会过期的,当有新的newKey替代它后,它的期限默认设置是7天
allKeys.put(prev.getKeyId(), prev);
zkWatcher.updateKeyInZK(prev);
}
}
3.既然token是由leader生产的,除非没有leader,才会没人生产。验证这个想法,我在zookeeper和一些region server启动当天的日志里找到了证据:
a) zk中的 /hbase/tokenauth/keymaster 节点用来存放leader的信息,然后进入zookeeper-client查看了下,根本没这个节点。
b) 一些尚有保留集群启动当天日志的region server上找到了如下异常:
org.apache.hadoop.hbase.security.token.AuthenticationTokenSecretManager: Zookeeper initialization failed
org.apache.zookeeper.KeeperException$NoAuthException: KeeperErrorCode = NoAuth for /hbase/tokenauth/keys
at org.apache.zookeeper.KeeperException.create(KeeperException.java:113)
at org.apache.zookeeper.KeeperException.create(KeeperException.java:51)
at org.apache.zookeeper.ZooKeeper.create(ZooKeeper.java:783)
at org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper.createNonSequential(RecoverableZooKeeper.java:421)
at org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper.create(RecoverableZooKeeper.java:403)
at org.apache.hadoop.hbase.zookeeper.ZKUtil.createWithParents(ZKUtil.java:1164)
at org.apache.hadoop.hbase.zookeeper.ZKUtil.createWithParents(ZKUtil.java:1142)
at org.apache.hadoop.hbase.security.token.ZKSecretWatcher.start(ZKSecretWatcher.java:58)
at org.apache.hadoop.hbase.security.token.AuthenticationTokenSecretManager.start(AuthenticationTokenSecretManager.java:105)
at org.apache.hadoop.hbase.ipc.SecureRpcEngine$Server.startThreads(SecureRpcEngine.java:275)
at org.apache.hadoop.hbase.ipc.HBaseServer.start(HBaseServer.java:1650)
at org.apache.hadoop.hbase.regionserver.HRegionServer.startServiceThreads(HRegionServer.java:1728)
at org.apache.hadoop.hbase.regionserver.HRegionServer.handleReportForDutyResponse(HRegionServer.java:1105)
at org.apache.hadoop.hbase.regionserver.HRegionServer.run(HRegionServer.java:753)
at java.lang.Thread.run(Thread.java:662)
这是AuthenticationTokenSecretManager启动时候失败了,启动的时候会先在ZK上创建/hbase/tokenauth/keys这个目录(即便这个目录已经存在也会执行这个操作,这是一种保证),这个目录用来存放leader生成的token。结果大家都没有/hbase/tokenauth的权限,所以都失败了(NoAuth for /hbase/tokenauth/keys,这里的提示有点瑕疵,实际上/hbase/tokenauth没有权限导致的)。然而发生这样的严重错误,server的启动并没有被终止,而是继续运行下去,留下了隐患。
AuthenticationTokenSecretManager:
publicvoidstart() {
try {
// populate any existing keysthis.zkWatcher.start(); //这里抛出的KeeperException // try to become leaderthis.leaderElector.start(); //这里竞争leader,但是因为异常这里不会被执行,所以没有人去竞争leader
} catch (KeeperException ke) {
LOG.error("Zookeeper initialization failed", ke); //发生异常,仅仅是打印一条error信息,而没有abort。在Hbase的很多地方,发生这样的错误都是会abort server的。
}
}
2015-08-25 14:35:08,273 DEBUG org.apache.hadoop.hbase.zookeeper.ZKLeaderManager: Claimed the leader znode as 'SVR4050HW2285.hadoop.xxx.com,60020,1440397852179'
2015-08-25 14:35:08,288 FATAL org.apache.hadoop.hbase.regionserver.HRegionServer: ABORTING region server SVR4050HW2285.hadoop.xxx.com,60020,1440397852179: Unable to synchronize secretkey 3 in zookeeper
org.apache.zookeeper.KeeperException$NoAuthException: KeeperErrorCode = NoAuth for /hbase/tokenauth/keys/3
at org.apache.zookeeper.KeeperException.create(KeeperException.java:113)
at org.apache.zookeeper.KeeperException.create(KeeperException.java:51)
at org.apache.zookeeper.ZooKeeper.setData(ZooKeeper.java:1266)
at org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper.setData(RecoverableZooKeeper.java:349)
at org.apache.hadoop.hbase.zookeeper.ZKUtil.updateExistingNodeData(ZKUtil.java:814)
at org.apache.hadoop.hbase.security.token.ZKSecretWatcher.updateKeyInZK(ZKSecretWatcher.java:197)
at org.apache.hadoop.hbase.security.token.AuthenticationTokenSecretManager.rollCurrentKey(AuthenticationTokenSecretManager.java:257)
at org.apache.hadoop.hbase.security.token.AuthenticationTokenSecretManager$LeaderElector.run(AuthenticationTokenSecretManager.java:317)
privatestatic ArrayList<ACL> createACL(ZooKeeperWatcher zkw, String node) {
if (isSecureZooKeeper(zkw.getConfiguration())) {
// Certain znodes are accessed directly by the client,// so they must be readable by non-authenticated clientsif ((node.equals(zkw.baseZNode) == true) ||
(node.equals(zkw.rootServerZNode) == true) ||
(node.equals(zkw.masterAddressZNode) == true) ||
(node.equals(zkw.clusterIdZNode) == true) ||
(node.equals(zkw.rsZNode) == true) ||
(node.equals(zkw.backupMasterAddressesZNode) == true) ||
(node.startsWith(zkw.assignmentZNode) == true) ||
(node.startsWith(zkw.masterTableZNode) == true) ||
(node.startsWith(zkw.masterTableZNode92) == true)) {
return ZooKeeperWatcher.CREATOR_ALL_AND_WORLD_READABLE;
}
return Ids.CREATOR_ALL_ACL;
} else {
return Ids.OPEN_ACL_UNSAFE;
}
}
/hbase/tokenauth及其子节点显然使用的是CREATOR_ALL_ACL权限。那4048创建了key,然后又挂掉的话,那其它机子显然不可能成为leader。这种权限设定似乎有点不科学。
因为B环境权限都很正常的,没出什么问题,我又对比了下A和B的权限和配置。
B leader生产的token的权限:
2015-08-25 14:33:01,515 FATAL org.apache.hadoop.hbase.security.token.ZKSecretWatcher: Unable to synchronize master key 4 to znode /hbase/tokenauth/keys/4
org.apache.zookeeper.KeeperException$NoAuthException: KeeperErrorCode = NoAuth for /hbase/tokenauth/keys/4
at org.apache.zookeeper.KeeperException.create(KeeperException.java:113)
at org.apache.zookeeper.KeeperException.create(KeeperException.java:51)
at org.apache.zookeeper.ZooKeeper.create(ZooKeeper.java:783)
at org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper.createNonSequential(RecoverableZooKeeper.java:421)
at org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper.create(RecoverableZooKeeper.java:403)
at org.apache.hadoop.hbase.zookeeper.ZKUtil.createWithParents(ZKUtil.java:1164)
at org.apache.hadoop.hbase.zookeeper.ZKUtil.createSetData(ZKUtil.java:868)
at org.apache.hadoop.hbase.security.token.ZKSecretWatcher.addKeyToZK(ZKSecretWatcher.java:180)
at org.apache.hadoop.hbase.security.token.AuthenticationTokenSecretManager.rollCurrentKey(AuthenticationTokenSecretManager.java:250)
at org.apache.hadoop.hbase.security.token.AuthenticationTokenSecretManager$LeaderElector.run(AuthenticationTokenSecretManager.java:317)
2015-08-25 14:33:01,516 FATAL org.apache.hadoop.hbase.regionserver.HRegionServer: ABORTING region server SVR4048HW2285.hadoop.xxx.com
,60020,1440397852099: Unable to synchronize secret key 4 in zookeeper
synchronizedvoid rollCurrentKey() {
if (!leaderElector.isMaster()) {
LOG.info("Skipping rollCurrentKey() because not running as master.");
return;
}
long now = EnvironmentEdgeManager.currentTimeMillis();
AuthenticationKey prev = currentKey;
AuthenticationKey newKey = new AuthenticationKey(++idSeq, //新token的id比上一次大一
Long.MAX_VALUE, // don't allow to expire until it's replaced by a new key
generateSecret());
allKeys.put(newKey.getKeyId(), newKey);
currentKey = newKey;
zkWatcher.addKeyToZK(newKey); //试图向zk写入新token
lastKeyUpdate = now;
if (prev != null) {
// make sure previous key is still stored
prev.setExpiration(now + tokenMaxLifetime);
allKeys.put(prev.getKeyId(), prev);
zkWatcher.updateKeyInZK(prev);
}
}