千家信息网

Hive创建表及插入数据demo

发表于:2025-01-24 作者:千家信息网编辑
千家信息网最后更新 2025年01月24日,create table student(id int comment "学生id",name string comment "学生姓名",age int comment "学生年龄")comment
千家信息网最后更新 2025年01月24日Hive创建表及插入数据demo

create table student(id int comment "学生id",name string comment "学生姓名",age int comment "学生年龄")
comment "学生信息表"
row format delimited fields terminated by ",";

create external table student_ext(id int comment "学生id",name string comment "学生姓名",age int comment "学生年龄")
comment "学生信息表"
row format delimited fields terminated by ","
location "/user/hive/student_ext";

create external table student_ptn(id int comment "学生id",name string comment "学生姓名",age int comment "学生年龄")
comment "学生信息表"
partitioned by (city string)
row format delimited fields terminated by ","
location "/user/hive/student_ptn";

set hive.exec.dynamici.partition=true; #开启动态分区,默认是false
set hive.exec.dynamic.partition.mode=nonstrict; #开启允许所有分区都是动态的,否则必须要有静态分区才能使用。

set hive.exec.dynamic.partition=true;(可通过这个语句查看:set hive.exec.dynamic.partition;)
set hive.exec.dynamic.partition.mode=nonstrict;
SET hive.exec.max.dynamic.partitions=100000;(如果自动分区数大于这个参数,将会报错)
SET hive.exec.max.dynamic.partitions.pernode=100000;

insert into table student_ptn partition(city) select 6,"yangdong",29,"beijing";
insert into table student_ptn partition(city) select 2,"limei",22,"chongqing";
insert into table student_ptn partition(city) select 3,"wangxing",25,"beijing";
insert into table student_ptn partition(city) select 4,"chenming",22,"beijing";
insert into table student_ptn partition(city) select 5,"xiali",26,"chongqing";

create external table student_bck(id int comment "学生id",name string comment "学生姓名",age int comment "学生年龄")
comment "学生信息表"
clustered by(id) sorted by(id asc) into 2 buckets
row format delimited fields terminated by ","
location "/user/hive/student_bck";

insert into table student_bck
select * from student;

create table cdt(
id int,
name string,
work_location array,
piaofang map,
address struct)
row format delimited
fields terminated by "\t"
collection items terminated by ","
map keys terminated by ":"
lines terminated by "\n";

将json字符串加载到table json中
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
{"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"}
{"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"}
{"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"}
{"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"}
{"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"}

CREATE TABLE json(
data string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://hmaster:9000/user/hive/warehouse/plusorg.db/json'
TBLPROPERTIES (
'transient_lastDdlTime'='1542008332')

load data local inpath "/root/json.txt" into table json;
get_json_object(data,'$.movie') 内置函数解析某列数据
select get_json_object(data,'$.movie') as movie from json;
json_tuple(jsonStr, k1, k2, ...) 内置函数解析json字符串类数据
参数为一组键k1,k2……和JSON字符串,返回值的元组。该方法比 get_json_object 高效,因为可以在一次调用中输入多个键

select
b.b_movie,
b.b_rate,
b.b_timeStamp,
b.b_uid
from json a
lateral view json_tuple(a.data,'movie','rate','timeStamp','uid') b as b_movie,b_rate,b_timeStamp,b_uid;

create table rate(movie int, rate int, unixtime int, userid int) row format delimited fields
terminated by '\t';

insert into table rate select
get_json_object(data,'$.movie') as moive,
get_json_object(data,'$.rate') as rate,
get_json_object(data,'$.timeStamp') as unixtime,
get_json_object(data,'$.uid') as userid
from json;

select from_unixtime(unixtime,'yyyy/MM/dd HH:mm:ss') from rate;

create table lastjsontable(movie int, rate int, utime date, userid int) row format delimited
fields terminated by '\t';
添加Python脚本,hive即可访问,路径为在当前Unix服务器存储绝对路径
add file /home/pythoncode/WeekdayMapper.py;

insert into table lastjsontable
select
transform(movie,rate,unixtime,userid) #输入值(基表)
using 'python WeekdayMapper.py' #使用脚本清洗
as(movie,rate,utime,userid) #输出值(子表)
from rate; #基表

0