Java API实现向Hive批量导入数据 - Hive

TOP

Java API实现向Hive批量导入数据

2018-12-05 01:11:51 【大中小】浏览:640次

Java程序中产生的数据，如果导入oracle或者mysql库，可以通过jdbc连接insert批量操作完成，但是当前版本的hive并不支持批量insert操作，因为需要先将结果数据写入hdfs文件，然后插入Hive表中。

package com.enn.idcard;

import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * <p>Description: </p>
 * @author kangkaia
 * @date 2017年12月26日 下午1:42:24
 */
public class HiveJdbc {

    public static void main(String[] args) throws IOException {
    	List<List> argList = new ArrayList<List>();
		List<String> arg = new ArrayList<String>();
		arg.add("12345");
		arg.add("m");
		argList.add(arg);
		arg = new ArrayList<String>();
		arg.add("54321");
		arg.add("f");
		argList.add(arg);
//		System.out.println(argList.toString());
		String dst = "/test/kk.txt";
		createFile(dst,argList);
		loadData2Hive(dst);
    }

    /**
     * 将数据插入hdfs中，用于load到hive表中，默认分隔符是"\001"
     * @param dst
     * @param contents
     * @throws IOException
     */
    public static void createFile(String dst , List<List> argList) throws IOException{
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path dstPath = new Path(dst); //目标路径
        //打开一个输出流
        FSDataOutputStream outputStream = fs.create(dstPath);
        StringBuffer sb = new StringBuffer();
        for(List<String> arg:argList){
			for(String value:arg){
				sb.append(value).append("\001");
			}
			sb.deleteCharAt(sb.length() - 4);//去掉最后一个分隔符
			sb.append("\n");
		}
        sb.deleteCharAt(sb.length() - 2);//去掉最后一个换行符
        byte[] contents =  sb.toString().getBytes();
        outputStream.write(contents);
        outputStream.close();
        fs.close();
        System.out.println("文件创建成功！");
        
    }
    /**
     * 将HDFS文件load到hive表中
     * @param dst
     */
    public static void loadData2Hive(String dst) {
    	String JDBC_DRIVER = "org.apache.hive.jdbc.HiveDriver";
    	String CONNECTION_URL = "jdbc:hive2://server-13:10000/default;auth=noSasl";
    	String username = "admin";
        String password = "admin";
        Connection con = null;
		
		try {
			Class.forName(JDBC_DRIVER);
			con = (Connection) DriverManager.getConnection(CONNECTION_URL,username,password);
			Statement stmt = con.createStatement();
			
			String sql = " load data inpath '"+dst+"' into table population.population_information ";
			
			stmt.execute(sql);
			System.out.println("loadData到Hive表成功！");
		} catch (SQLException e) {
			e.printStackTrace();
		} catch (ClassNotFoundException e) {
			e.printStackTrace();
		}finally {
			// 关闭rs、ps和con
			if(con != null){
				try {
					con.close();
				} catch (SQLException e) {
					e.printStackTrace();
				}
			}
		}
	}
    
}

注意：本例使用mvn搭建，conf配置文件放在src/main/resources目录下。

Hive提供的默认文件存储格式有textfile、sequencefile、rcfile等。用户也可以通过实现接口来自定义输入输的文件格式。

在实际应用中，textfile由于无压缩，磁盘及解析的开销都很大，一般很少使用。Sequencefile以键值对的形式存储的二进制的格式，其支持针对记录级别和块级别的压缩。rcfile是一种行列结合的存储方式（text file和sequencefile都是行表[row table]），其保证同一条记录在同一个hdfs块中，块以列式存储。一般而言，对于OLTP而言，行表优势大于列表，对于OLAP而言，列表的优势大于行表，特别容易想到当做聚合操作时，列表的复杂度将会比行表小的多，虽然单独rcfile的列运算不一定总是存在的，但是rcfile的高压缩率确实减少文件大小，因此实际应用中，rcfile总是成为不二的选择，达观数据平台在选择文件存储格式时也大量选择了rcfile方案。

通过hdfs导入hive的表默认是textfile格式的，因此可以改变存储格式，具体方法是先创建sequencefile、rcfile等格式的空表，然后重新插入数据即可。

insert overwrite table seqfile_table select * from textfile_table; 
……
insert overwrite table rcfile_table select * from textfile_table;


【大中小】【打印】【繁体】【投稿】【收藏】【推荐】【举报】【评论】【关闭】【返回顶部】

上一篇：MySQL同步到Hive操作步骤	下一篇：Hive的字符串函数