Hive database DDL
Enabling local mode
set hive.exec.mode.local.auto=true; //enable local MR
//maximum input data size for local MR; local MR is used when the input size is below this value (default 134217728, i.e. 128 MB)
set hive.exec.mode.local.auto.inputbytes.max=50000000;
//maximum number of input files for local MR; local MR is used when the file count is below this value (default 4)
set hive.exec.mode.local.auto.input.files.max=10;
create database text; //create a database named text
create database if not exists text; //create only if the database does not already exist
show databases; //list all databases
desc database text; //show database information
desc database extended text; //show more detailed information
Modify a database: alter database text set dbproperties (...); //only the database's properties can be modified
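A concrete example of the statement above (the property name and value are made up for illustration):
alter database text set dbproperties('createtime'='2022-06-14');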
drop database text; //drop the database
drop database text cascade; //force-drop a non-empty database -- use with caution!!!
Table DDL
Table creation syntax
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name ---managed (internal) or external table
[(col_name data_type [COMMENT col_comment], ...)] ---column name, type, comment
[COMMENT table_comment] ---table comment
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)] ---partitioned table
[CLUSTERED BY (col_name, col_name, ...) ---bucketed table
[SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS] ---bucketed table (sorted within buckets)
[ROW FORMAT row_format] ---how rows are parsed (delimiters/SerDe)
[STORED AS file_format] ---file storage format
[LOCATION hdfs_path] ---HDFS directory for the table
[TBLPROPERTIES (property_name=property_value, ...)] ---table properties
[AS select_statement] ---create the table from a query result
[LIKE table_name] ---copy another table's structure
Creating a table
create table student2( ---create a table
id int,
name string
)
row format delimited fields terminated by '\t' ---fields are separated by \t
collection items terminated by " "
map keys terminated by ":"
location "path"; ---specify a directory; if omitted, the table folder is created under the database directory
Viewing tables
show tables; ---list the tables
desc table_name; ---show the table's column structure
desc formatted student2; ---show the detailed table structure
create table table_name as
select name,age from test; ----create a table from a query result
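The like clause from the syntax template above is not demonstrated elsewhere in these notes; a minimal sketch (the table name student2_copy is made up for illustration):
create table student2_copy like student2; ---copies only the table structure, not the data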
alter table student1 change column id idi bigint comment 'idi'; ---change a column (rename/retype)
alter table student1 add columns (cd int comment "cd"); ---append columns
alter table student1 replace columns (id string,name string comment "cd"); ---replace all columns
drop table student1; ---drop the table
truncate table student1; ---remove all data (the table structure is kept)
Data DML
---load data into a table from the local file system
load data local inpath "linux_path" into table student1; ---append data from the local file system
load data local inpath "linux_path" overwrite into table student1; ---overwrite the table with data from the local file system
--- load data from HDFS (overwrite) (the file is moved into the table's data directory)
load data inpath "hdfs_path" overwrite into table student1;
---insert values
insert into student1 values (100,"daf");
---insert the result of a query
insert into student1 select id,name from student2 where id<1000;
---overwrite the table with a query result
insert overwrite table student2 select id,name from student2 where id<1000;
---create a table from a query (CTAS)
create table student3 as select id,name from student2 where id<1000;
---load data by pointing location at an existing directory
---expose data already on HDFS as a table
create external table student4 (id int,name string) row format delimited fields terminated by "\t" location "/path";
---export a query result to the local file system
insert overwrite local directory '/opt/module/hive/datas/export/student' select * from student;
---export a query result to the local file system with formatting
insert overwrite local directory '/opt/module/hive/datas/export/student'
row format delimited fields terminated by '\t'
select * from student;
---export a query result to HDFS (omit the local keyword)
insert overwrite directory '/opt/module/hive/datas/export/student'
row format delimited fields terminated by '\t'
select * from student;
---Export the table to HDFS
export table default.student to
'/user/hive/warehouse/export/student';
import table stuin from "path";
External tables
1) Concept
Because the table is external, Hive does not assume full ownership of the data. Dropping the table does not delete the underlying data; only the table's metadata is removed.
2) When to use managed vs. external tables
Website logs collected every day are regularly loaded into HDFS as text files. Heavy statistical analysis is done on top of an external table (the raw log table); the intermediate and result tables it produces are stored as managed (internal) tables, populated via SELECT + INSERT.
create external table if not exists teacher(
id int,
name string
)
row format delimited fields terminated by '\t'
location '/school/teacher';
------convert the managed table student to an external table
alter table student set tblproperties('EXTERNAL'='TRUE');
------convert the external table student back to a managed table
alter table student set tblproperties('EXTERNAL'='FALSE');
Basic queries, grouping and filtering
10,ACCOUNTING,1700
20,RESEARCH,1800
30,SALES,1900
40,OPERATIONS,1700
create table if not exists dept(
deptno int, -- department number
dname string, -- department name
loc int -- department location
)
row format delimited fields terminated by '\t';
7369,SMITH,CLERK,7902,
7499,ALLEN,SALESMAN,7698,
7521,WARD,SALESMAN,7698,
7566,JONES,MANAGER,7839,
7654,MARTIN,SALESMAN,7698,
7698,BLAKE,MANAGER,7839,
7782,CLARK,MANAGER,7839,
7788,SCOTT,ANALYST,7566,
7839,KING,PRESIDENT,,
7844,TURNER,SALESMAN,7698,
7876,ADAMS,CLERK,7788,
7900,JAMES,CLERK,7698,
7902,FORD,ANALYST,7566,
7934,MILLER,CLERK,7782,
create table if not exists emp(
empno int, -- employee number
ename string, -- employee name
job string, -- employee job (big data engineer, front-end engineer, Java engineer, ...)
sal double, -- employee salary
deptno int -- department number
)
row format delimited fields terminated by '\t';
---select specific columns
select ename,sal from emp;
--alias a column; the as keyword can be omitted
select ename as name,sal from emp;
---add 10 to every salary
select ename,sal+10 from emp;
---common aggregate functions (UDAF); NULL values are ignored during aggregation
----total row count (count)
select count(*) cnt from emp;
----maximum salary (max)
select max(sal) max_sal from emp;
----minimum salary (min)
select min(sal) min_sal from emp;
----total salary (sum)
select sum(sal) sum_sal from emp;
----average salary (avg)
select avg(sal) avg_sal from emp;
---show the first 5 rows
select ename from emp limit 5;
---show rows 2 through 5
select ename from emp limit 1,4;
---where filter condition
select ename,sal from emp where sal>1000;
---null-safe equals <=> demo; compare the two queries below
select ename,job from emp where job = null;
select ename,job from emp where job <=> null;
---employees with salary between 500 and 1000
select * from emp where sal between 500 and 1000;
----employees whose job is null
select * from emp where job is null;
-----employees whose salary is 1500 or 5000
select * from emp where sal IN (1500, 5000);
----find employees whose names start with S
select
*
from emp
where ename LIKE 'S%';
---equivalent
select
*
from emp
where ename RLIKE '^S';
----salary greater than 1000 and department 30
select * from emp where sal > 1000 and deptno = 30;
----salary greater than 1000 or department 30
select * from emp where sal>1000 or deptno=30;
----employees not in department 20 or 30
select * from emp where deptno not IN(30, 20);
Grouping and group filtering
---average salary of each department in emp.
select
t.deptno,
avg(t.sal) avg_sal
from emp t
group by t.deptno;
----highest salary for each job within each department.
select
t.deptno,
t.job,
max(t.sal) max_sal
from emp t
group by t.deptno, t.job;
---departments whose average salary is greater than 2000.
select
deptno,
avg(sal) avg_sal
from emp
group by deptno
having avg_sal > 2000; ---where cannot be used here, because where is evaluated before group by
Joins
(A join combines two tables into one result.)
----employee name together with department name
----inner join
select emp.ename,dept.dname from emp inner join dept on emp.deptno=dept.deptno;
To demonstrate the different join types, some extra rows need to be inserted.
---left join
select emp.ename,dept.dname from emp left join dept on emp.deptno=dept.deptno;
An inner join keeps only the rows that match on the join key.
The difference between a left and a right join: a left join keeps every row of the left table, including left rows with no match on the right; a right join keeps every row of the right table, including right rows with no match on the left.
---right join
select emp.ename,dept.dname from emp right join dept on emp.deptno=dept.deptno;
---full join keeps all rows from both sides
select emp.ename,dept.dname from emp full join dept on emp.deptno=dept.deptno;
Multi-table joins
---joining more than two tables
create table if not exists location(
loc int, -- location id
loc_name string -- location name
)
row format delimited fields terminated by '\t';
1700,北京
1800,上海
1900,深圳
---employee name, department name and location name
select e.ename,
d.dname,
l.loc_name
from emp e
join dept d on e.deptno = d.deptno
join location l on d.loc = l.loc;
---employee name, department name and location name
select e.ename,
d.dname,
l.loc_name
from emp e
left join dept d on e.deptno = d.deptno
right join location l on d.loc = l.loc;
---Cartesian product (a join without an on condition produces a Cartesian product)
select * from emp join dept;
---note the following case
insert into dept values(40,"xxxx",1800);
---test
select emp.ename,dept.dname from emp inner join dept on emp.deptno=dept.deptno;
The result now has 17 rows.
Every join is conceptually a Cartesian product filtered by the join condition; whenever the join key is not unique, rows multiply, which is why join keys are normally unique, primary-key-like columns.
Sorting
---sort by salary from high to low
select * from emp order by sal desc;
---sort by an alias
select ename,sal salary from emp order by salary desc;
---secondary sort
Sort by department number ascending first, then within the same department by salary descending (two sort keys).
select * from emp order by deptno,sal desc;
---sort departments by average salary, descending
select deptno,avg(sal) avg_sal from emp group by deptno order by avg_sal desc;
---a global order by can hit performance problems on large data, so it is usually combined with limit
select * from emp order by sal desc limit 5;
---for distribute by / sort by, the number of reducers needs to be set
set mapreduce.job.reduces=3; //start 3 reducers; rows are routed to the 3 partitions by hash
---distribute by chooses the field whose hash determines the partition; sort by sorts within each partition
select * from emp distribute by empno sort by sal desc;
---when the distribute by and sort by fields are the same, cluster by can be used instead
select * from emp distribute by empno sort by empno;
which is equivalent to
select * from emp cluster by empno;
Functions
Functions fall into built-in (system) functions and user-defined functions.
---list all built-in functions
show functions;
---list functions whose names match a pattern
show functions like "*date";
---show how to use a specific function
desc function 'current_date';
---show more detailed help for a function
desc function extended 'current_date';
Null value replacement
---- replacing nulls, two arguments
---nvl(col,default_value): returns col if col is not null, otherwise returns default_value
select ename,job,nvl(job,"没工作") from emp;
---multiple inputs
---coalesce(col1,col2,...): returns the first non-null value from left to right; here, if job is null fall back to sal, and if both are null return "啥也没有"
select ename,job,sal,coalesce(job,sal,"啥也没有") from emp;
Conditional logic
---sample data
create table emp_sex(
name string, --name
dept_id string, --department id
sex string --sex
)
row format delimited fields terminated by "\t"; //note: the delimiter is \t
悟空,A,男
大海,A,男
宋宋,B,男
凤姐,A,女
婷姐,B,女
婷婷,B,女
---if(boolean,result1,result2): if boolean is true, returns result1, otherwise returns result2
---count how many men and women each department has
select
dept_id,count(name) cnt,
count(`if`(sex="男",name,null)) male,
count(`if`(sex="女",name,null)) female
from emp_sex
group by dept_id;
---first form
---case col when value1 then result1
---when value2 then result2
---else result3
---end
---if col equals value1 return result1; if it equals value2 return result2; otherwise return result3
---second form
---case when
---boolean1 then result1
---when boolean2 then result2
---else result3
---end
---if boolean1 is true return result1; if boolean1 is false and boolean2 is true return result2; otherwise return result3
select
dept_id,count(name) cnt,
count(case sex when '男' then name else null end) male,
count(case when sex='女' then name else null end) female
from emp_sex
group by dept_id;
Rows to columns and columns to rows
Rows to columns
---sample data
孙悟空,白羊座,A
大海,射手座,A
宋宋,白羊座,B
猪八戒,白羊座,A
凤姐,射手座,A
苍老师,白羊座,B
create table person_info(
name string, --name
constellation string, --constellation
blood_type string --blood type
)
row format delimited fields terminated by "\t";
---count how many people share the same constellation and blood type
select constellation,blood_type,
count(name) cnt
from person_info
group by constellation,blood_type;
---group people with the same constellation and blood type together
---rows-to-columns means collapsing many rows into one column value; the main functions are collect_set(col) and collect_list(col)
---both are aggregate functions that gather the col values of one group into an array (set deduplicates, list does not)
select constellation,blood_type,
count(name) cnt,
collect_list(name) names,
collect_set(name) names_dedup
from person_info
group by constellation,blood_type;
射手座,A,3,"[""大海"",""大海"",""凤姐""]","[""大海"",""凤姐""]"
白羊座,A,2,"[""孙悟空"",""猪八戒""]","[""孙悟空"",""猪八戒""]"
白羊座,B,2,"[""宋宋"",""苍老师""]","[""宋宋"",""苍老师""]"
---insert a duplicate row to test deduplication
insert into person_info values ('大海','射手座','A');
---group people with the same constellation and blood type into one row
---string concatenation
---concat(v1,v2,...): concatenates the input columns into one string
---concat_ws(sep,array): joins the elements of an array into one string separated by sep
select concat(constellation,",",blood_type) xzxx,
concat_ws("|",collect_set(name))
from person_info
group by constellation,blood_type;
"射手座,A",大海|凤姐
"白羊座,A",孙悟空|猪八戒
"白羊座,B",宋宋|苍老师
Columns to rows
create table movie_info(
movie string, --movie title
category string --movie categories
)
row format delimited fields terminated by "\t";
---sample data
《疑犯追踪》,"悬疑,动作,科幻,剧情"
《Lie to me》,"悬疑,警匪,动作,心理,剧情"
《战狼2》,"战争,动作,灾难"
---explode(array|map) is a UDTF (table-generating function) that turns one row into multiple rows
---if the argument is an array the result has one column; if it is a map the result has two columns, key and value
---split(str,sep) splits str by sep into an array of strings
select
explode(split(category,",")) yp
from movie_info;
---columns to rows: lateral view
---query the original table together with the UDTF output
select movie,
yp.creid
from movie_info m
lateral view explode(split(category,",")) yp as creid;
---the lateral view ... as ... pattern above is the fixed syntax
《疑犯追踪》,悬疑
《疑犯追踪》,动作
《疑犯追踪》,科幻
《疑犯追踪》,剧情
《Lie to me》,悬疑
《Lie to me》,警匪
《Lie to me》,动作
《Lie to me》,心理
《Lie to me》,剧情
《战狼2》,战争
《战狼2》,动作
《战狼2》,灾难
Window functions
1. Windowing an aggregate function (showing aggregate results alongside detail rows)
---window functions let a detail query also display aggregated/summary values
---sample data
create table business(
name string,
orderdate string,
cost int
)
row format delimited fields terminated by ',';
jack,2017-01-01,10
tony,2017-01-02,15
jack,2017-02-03,23
tony,2017-01-04,29
jack,2017-01-05,46
jack,2017-04-06,42
tony,2017-01-07,50
jack,2017-01-08,55
mart,2017-04-08,62
mart,2017-04-09,68
neil,2017-05-10,12
mart,2017-04-11,75
neil,2017-06-12,80
mart,2017-04-13,94
---customers who bought in April 2017, plus the total number of such customers
select name from business where substring(orderdate,1,7) = '2017-04';
---substring(str,pos,len): returns the substring of str starting at position pos (1-based) with length len
select name from business where substring(orderdate,1,7) = '2017-04' group by name;
equivalent to
select distinct name from business where substring(orderdate,1,7) = '2017-04';
---to get only the total number of customers who bought in April 2017
select count(distinct name) from business where substring(orderdate,1,7) = '2017-04';
---to show both results of one sql in a single output, use an over() window
select distinct name,
count(distinct name) over()
from business
where substring(orderdate,1,7)='2017-04';
---purchase details per customer together with the monthly purchase total
select name,
orderdate,
cost,
sum(cost) over(partition by substring(orderdate,1,7)) ---monthly purchase total
from business;
---each customer's running total of spending up to each purchase date
select * from business order by name,orderdate;
select name,
orderdate,
cost,
sum(cost) over(partition by name order by orderdate rows between unbounded preceding and current row )
from business;
---total over the two most recent purchases (current and previous row)
select name,
orderdate,
cost,
sum(cost) over(partition by name order by orderdate rows between 1 preceding and current row ) ---the last two purchases
from business;
---small exercise
---purchase details plus the customers who had visited the store so far
select name,orderdate,cost,
collect_set(name) over(order by orderdate rows between 1 preceding and current row )
from business;
2. Functions used together with an ordered window
---lag/lead
---lag(col,n,default_value) over(ordered window)
---purchase details plus each customer's previous visit date
---returns the value of col n rows before the current row; if there is none, returns default_value
select name,orderdate,cost,
lag(orderdate,1,"1970-01-01") over (partition by name order by orderdate) las
from business;
---lead(col,n,default_value) over(ordered window)
---returns the value of col n rows after the current row; if there is none, returns default_value
---show the next visit date
select name,orderdate,cost,
lag(orderdate,1,"1970-01-01") over (partition by name order by orderdate) las,
lead(orderdate,1,"2050-01-01") over (partition by name order by orderdate) lasd
from business;
---ntile
---ntile(n) over(ordered window)
---splits the window's rows into n groups and returns the current row's group number
---split the order details into 5 groups by order date
select name,orderdate,cost,
ntile(5) over(order by orderdate) b
from business;
---how to query the first 20% of orders
select name,orderdate,cost from (
select name,orderdate,cost,
ntile(5) over(order by orderdate) z
from business) t1
where z=1;
rank/dense_rank/row_number
create table score(
name string,
subject string,
score int
)
row format delimited fields terminated by "\t";
孙悟空,语文,87
孙悟空,数学,95
孙悟空,英语,68
大海,语文,94
大海,数学,56
大海,英语,84
宋宋,语文,64
宋宋,数学,86
宋宋,英语,84
婷婷,语文,65
婷婷,数学,85
婷婷,英语,78
---rank() over (ordered window)
---rank is the familiar ranking: ties share a rank and the following rank is skipped
---dense_rank does not skip ranks after ties
---row_number is simply the row number
select name,subject,score,
rank() over(partition by subject order by score desc) rank,
dense_rank() over(partition by subject order by score desc) dense_rank,
row_number() over(partition by subject order by score desc) row_number
from score;
---top three per subject
select name,subject,score
from(select name,subject,score,
rank() over (partition by subject order by score desc) ran
from score)t1 where ran <=3;
Window function exercise
create table sales(
id string, ---product id
category_id string, ---product category id
sales_sum int ---units sold
);
----sample data: a sql statement that generates random rows
insert overwrite table sales
select pos,
`floor`(rand(1231239819)*9) + 1,
`floor`(rand(1231457656)*10000) + 100
from (
select posexplode(split(repeat("a",5000),"a"))
)t1
where pos <> 0;
select * from sales;
----top 10 best-selling products within each category
select category_id,id,ran
from (
select id,category_id,sales_sum,
rank() over (partition by category_id order by sales_sum desc) ran
from sales)t1
where ran <=10;
---top 10 products per category
---first: the top-10 products' ids and their combined sales per category
select category_id,
collect_list(id) ids,
sum(sales_sum) sum_sc
from (
select id,category_id,sales_sum,
rank() over (partition by category_id order by sales_sum desc) ran
from sales)t1
where ran <=10
group by category_id;
---variant: the top-10 products' ids, but the total sales over all of the category's products
select category_id,
collect_list(if(ran<=10,id,null)) ids,
sum(sales_sum) sum_sc
from (
select id,category_id,sales_sum,
rank() over (partition by category_id order by sales_sum desc) ran
from sales)t1
group by category_id;
first_value/last_value
---first_value(col,boolean) over(ordered window)
---returns the first value of col in the window; if boolean is true, returns the first non-null value of col in the window
---last_value(col,boolean) over(ordered window)
---returns the last value of col in the window; if boolean is true, returns the last non-null value of col in the window
---business details plus, up to each purchase date, the dates of the first and last purchases over 50
select
name,
orderdate,
cost,
first_value(if(cost>50,orderdate,null),true) over(partition by name order by orderdate rows between unbounded preceding and current row ) fir,
last_value(if(cost>50,orderdate,null),true) over(partition by name order by orderdate rows between unbounded preceding and current row ) fls
from business
order by name,orderdate;
Other common functions
Date functions
---current_date() returns the current date
select `current_date`();
---date_add(date,n) returns the date n days after date
select date_add(`current_date`(),1); ---today's date plus one day
---date_sub(date,n) returns the date n days before date
select date_sub(`current_date`(),1);
---datediff(date1,date2) returns the number of days between date1 and date2 (date1 - date2)
select datediff(`current_date`(),'2022-06-13');
---date_format
---date_format(date/timestamp/string,fmt) formats a date/timestamp/string value according to fmt
---fmt: 'yyyy-MM-dd HH:mm:ss'
select date_format(current_date(),'yyyy/MM');
----year(date) returns the year of the date
select year(`current_date`());
----month(date) returns the month of the date
select month(`current_date`());
----day(date) returns the day of the month
select day(`current_date`());
----dayofweek(date) returns the day of the week (Sunday counts as day 1)
select `dayofweek`(`current_date`());
----weekofyear(date) returns the week of the year
select weekofyear(`current_date`());
Timestamp functions
---current_timestamp() returns the timestamp at which the sql runs
select current_timestamp();
----convert the current timestamp, in seconds
---to_unix_timestamp converts a timestamp to a long (seconds since the epoch)
select to_unix_timestamp(current_timestamp());
----convert between time zones
select to_utc_timestamp(`current_timestamp`(),'GMT+09:00');
---from_unixtime converts a long number of seconds into the given format, e.g. 1970-05-23
select from_unixtime(12323243,'yyyy-MM-dd');
Rounding functions
----ceil(num) rounds up
select ceil(5.1);
----floor(num) rounds down
select `floor`(5.1);
----round(num) rounds to the nearest integer
select round(5.4);
Complex-type constructor functions
---str_to_map(str,field_sep,kv_sep)
---splits str into segments by field_sep, splits each segment into a key and a value by kv_sep, and returns a map
select str_to_map('a:b,c:d,e:f',",",":");
select str_to_map('a:b,c:d,e:f',",",":")["a"]; ---looking up key a returns b
---named_struct(name1,col1,name2,col2,...)
---wraps col1,col2,... into a struct whose field names are name1,name2,...
select named_struct("name",name,"orderdate",orderdate,"cost",cost) from business; ---pack each row into a single struct column
select collect_list(named_struct("name",name,"orderdate",orderdate,"cost",cost)) from business; ---with an aggregate on top, the rows of a group become an array of structs, tying rows together
User-defined functions
·1 Hive ships with some built-in functions such as max/min, but the set is limited; it can be extended conveniently with user-defined functions.
·2 When the built-in functions cannot meet your business needs, consider user-defined functions (UDF: user-defined function).
·3 User-defined functions fall into three categories:
(1)UDF(User-Defined-Function)
One row in, one row out.
(2)UDAF(User-Defined Aggregation Function)
User-defined aggregate function: many rows in, one row out.
Similar to count/max/min.
(3)UDTF(User-Defined Table-Generating Functions)
User-defined table-generating function: one row in, many rows out.
For example, lateral view explode().
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.Arrays;
/***
* A UDF that returns the length of a string
*/
public class MyUDF extends GenericUDF {
/***
* Validate the number and types of the input arguments
* @param objectInspectors
* @return
* @throws UDFArgumentException
*/
@Override
public ObjectInspector initialize(ObjectInspector[] objectInspectors) throws UDFArgumentException {
//first check the number of arguments
if (objectInspectors.length != 1){
throw new UDFArgumentLengthException("the function takes exactly one argument");
}
//then check that the single argument is a string
if (!"string".equals(objectInspectors[0].getTypeName())){
throw new UDFArgumentTypeException(0,"the argument must be of type string");
}
//return the object inspector describing the output type (int)
return PrimitiveObjectInspectorFactory.javaIntObjectInspector;
}
/***
* The core logic lives here:
* compute the result from the input arguments;
* takes a string, returns an int
* @param deferredObjects
* @return
* @throws HiveException
*/
@Override
public Object evaluate(DeferredObject[] deferredObjects) throws HiveException {
//get the input argument
Object o = deferredObjects[0].get();
//guard against a null input value
if (o == null) {
return null;
}
String line = o.toString();
//return the length of the string
return line.length();
}
/***
* Text shown for this UDF in explain output; not important here
* @param strings
* @return
*/
@Override
public String getDisplayString(String[] strings) {
return Arrays.toString(strings);
}
}
6. Using the custom jar
·Package the jar and upload it to hive/lib
·Restart hiveserver2, or add the jar from the client
Either way works.
hive_services.sh restart
./hiveserver2 restart
Run the following in beeline to hot-add the jar:
add jar /export/server/hive/lib/hiveUdf-1.0-SNAPSHOT.jar;
Create a function in Hive and associate it with the jar:
add jar /export/server/hive/lib/hiveUdf-1.0-SNAPSHOT.jar;
create function mylen as 'MyUDF';
Run a test
select mylen("zq");
To remove the function
drop function mylen;
Partitions and buckets
---benefits of partitioned tables (for huge data volumes; in production, partition by time)
---1. very large tables should be stored partitioned
---2. filtering on the partition column reduces the amount of data scanned
---3. in production, try not to run full-table scans on partitioned tables
---a partitioned table manages one big table as a set of subdirectories
---create a partitioned table
create table dept_partition(
deptno int,
dname string,
loc int
)
partitioned by (dt string)
row format delimited fields terminated by "\t";
---load data into a partition (load)
load data local inpath "/export/wendaangtxt/dept.txt"
into table dept_partition partition (dt="2022-6-14");
---insert data (insert is more common)
---overwrite insert
insert overwrite table dept_partition partition (dt="2022-06-17")
select deptno,dname,loc from dept;
---another way to write it (the partition value comes from the select)
insert overwrite table dept_partition
select deptno,dname,loc,'2022-06-18' from dept;
---list the table's partitions
show partitions dept_partition;
---query with a partition filter
select * from dept_partition where dt ="2022-6-14";
---operate on the partitions of a partitioned table directly
--add a partition
alter table dept_partition add partition (dt='2022-6-19');
---add two partitions at once
alter table dept_partition add partition (dt='2022-6-20') partition (dt='2022-6-21');
---drop partitions
alter table dept_partition drop partition (dt="2022-6-19");
alter table dept_partition drop partition(dt="2022-6-20"),partition (dt="2022-6-21");
---consider the following operation
--create the partition directory directly inside the table's folder
hadoop fs -cp /user/hive/warehouse/text.db/dept_partition/dt=2022-06-18
/user/hive/warehouse/text.db/dept_partition/dt=2022-06-13
---question: can the data for the 13th now be seen in the table?
!!!!!!No.
show partitions dept_partition; ---the partition is not listed!!!
---a directory created directly under the table's data path is not recognized by the partitioned table (partitions have their own metadata)
---option 1
alter table dept_partition add partition (dt='2022-06-13');
---option 2
---the built-in command to repair the partition metadata
msck repair table dept_partition;
//scans the directories and registers the missing partitions
Two-level partitioned tables
---two-level partitioned table
create table dept_partition2(
deptno int,
dname string,
loc int
)
partitioned by (month string,day string)
row format delimited fields terminated by "\t";
---insert data
insert into dept_partition2
select deptno,dname,loc,'2022-06','14' from dept;
---when loading, every partition level must be specified
load data local inpath "/export/wendaangtxt/dept.txt"
into table dept_partition2 partition (month="2022-06",day='15');
Dynamic partitioning
---dynamic partitioning
---an ordinary table cannot be converted into a partitioned table directly
---to convert, create a new partitioned table first and then insert the old data into it
---example: convert the emp table into a table partitioned by deptno
create table emp_par(
empno int,
ename string,
job string,
salary decimal(16,2)
)
partitioned by (deptno int)
row format delimited fields terminated by "\t";
---insert the data into this partitioned table
---method A: insert one partition at a time
insert into emp_par partition (deptno=10)
select empno,ename,job,sal from emp where deptno = 10;
---method B: the easy way, dynamic partitioning, all partitions at once
insert into emp_par
select empno,ename,job,sal,deptno from emp;
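Method B above supplies every partition value dynamically; Hive's dynamic partition mode defaults to strict, which requires at least one static partition, so the mode usually has to be relaxed first (both settings below are standard Hive parameters):
set hive.exec.dynamic.partition=true; ---enable dynamic partitioning (default true)
set hive.exec.dynamic.partition.mode=nonstrict; ---allow all partition columns to be dynamic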
---note that the number of automatically generated partitions is capped
hive.exec.max.dynamic.partitions=1000 ---global cap on dynamic partitions
hive.exec.max.dynamic.partitions.pernode=100 ----cap per node
hive.exec.max.created.files=100000 ----maximum number of files created
hive.error.on.empty.partition=false ---whether to throw an exception when a partition is empty
Bucketed tables
---sample data
create table student2(
id int,
name string
)
row format delimited fields terminated by "\t";
select * from student2;
1001,ss1
1002,ss2
1003,ss3
1004,ss4
1005,ss5
1006,ss6
1007,ss7
1008,ss8
1009,ss9
1010,ss10
1011,ss11
1012,ss12
1013,ss13
1014,ss14
1015,ss15
1016,ss16
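The statements below load into a bucketed table stu_buck whose DDL is not shown in these notes; a minimal sketch, assuming 4 buckets on id:
create table stu_buck(
id int,
name string
)
clustered by (id) into 4 buckets
row format delimited fields terminated by "\t";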
---inserting into a bucketed table runs MR, so avoid load if possible; if you do use load, load the file from HDFS
---load
---because the insert runs MR, the node running the map task (node1/node2/node3, somewhat random) may not have a local copy of the file, and it cannot fetch the data from another node
load data inpath "/datas/student.txt" into table stu_buck; ---better avoided
---think: what is the difference in efficiency?
select * from stu_buck where id = 1003; ---only the bucket that 1003 hashes to needs scanning
select * from stu_buck where name = "zhangsan"; ---to find zhangsan, every bucket must be scanned
---bucketing acts a bit like an index and can speed up queries to a degree
---use insert instead
insert overwrite table stu_buck
select * from student2;
---differences between partitioning and bucketing
----not every column is suitable as a partition column
----some columns, typically primary-key-like ones, are better suited to bucketing
----a partition is a directory, a bucket is a file
----bucketing also speeds up joins between large tables
---a table that is both partitioned and bucketed
create table stu_par_buck(
id int,
name string
)
partitioned by (dt string)
clustered by (id) sorted by (id desc ) into 4 buckets
row format delimited fields terminated by "\t";
---insert data
insert into stu_par_buck
select id,name,'2022-06-14' from student2;
Introduction to columnar storage
Compression and storage
1. Storage formats in Hive
The storage format is specified in the create table statement.
The main data storage formats Hive supports are TEXTFILE, SEQUENCEFILE, ORC and PARQUET.
The first two are row-oriented; the last two are column-oriented.
(In practice, ORC with SNAPPY compression is the usual choice, simply because it is fast.)
create database CSYB;
use CSYB;
---uncompressed text format
create table bigtable(
id bigint,
t bigint,
uid string,
keyword string,
url_rank int,
click_num int,
click_url string
)
row format delimited fields terminated by '\t';
----ORC format, uncompressed
create table bigtable_orc(
id bigint,
t bigint,
uid string,
keyword string,
url_rank int,
click_num int,
click_url string
)
row format delimited fields terminated by '\t'
stored as orc
tblproperties ("orc.compress"="NONE");
---a parquet table, uncompressed
create table bigtable_parquet(
id bigint,
t bigint,
uid string,
keyword string,
url_rank int,
click_num int,
click_url string
)
row format delimited fields terminated by '\t'
stored as parquet ;
-----compressed variants
----ORC with zlib compression
create table bigtable_orc_zlib(
id bigint,
t bigint,
uid string,
keyword string,
url_rank int,
click_num int,
click_url string
)
row format delimited fields terminated by '\t'
stored as orc
tblproperties ("orc.compress"="ZLIB");
----ORC with snappy compression
create table bigtable_orc_snappy(
id bigint,
t bigint,
uid string,
keyword string,
url_rank int,
click_num int,
click_url string
)
row format delimited fields terminated by '\t'
stored as orc
tblproperties ("orc.compress"="SNAPPY");
---a parquet table with GZIP compression
create table bigtable_parquet_GZIP(
id bigint,
t bigint,
uid string,
keyword string,
url_rank int,
click_num int,
click_url string
)
row format delimited fields terminated by '\t'
stored as parquet
tblproperties ("parquet.compression"="GZIP");
---a parquet table with snappy compression
create table bigtable_parquet_snappy(
id bigint,
t bigint,
uid string,
keyword string,
url_rank int,
click_num int,
click_url string
)
row format delimited fields terminated by '\t'
stored as parquet
tblproperties ("parquet.compression"="SNAPPY");
load data local inpath '/export/wentext/bigtable' into table bigtable;
insert into bigtable_orc select * from bigtable;
insert into bigtable_orc_snappy select * from bigtable;
insert into bigtable_orc_zlib select * from bigtable;
insert into bigtable_parquet select * from bigtable;
insert into bigtable_parquet_GZIP select * from bigtable;
insert into bigtable_parquet_snappy select * from bigtable;
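To compare how much space each format/codec actually uses, the table directories can be inspected from the Hive CLI with the dfs command (the paths below assume the default warehouse location and the csyb database created above):
dfs -du -h /user/hive/warehouse/csyb.db/bigtable_orc;
dfs -du -h /user/hive/warehouse/csyb.db/bigtable_orc_snappy;
dfs -du -h /user/hive/warehouse/csyb.db/bigtable_parquet_snappy;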
select * from bigtable_orc_snappy;
Enterprise-level tuning
explain select * from student; (view the execution plan)
1.Fetch Operator (fetch data directly, no mapreduce)
1.limit (how many rows to fetch)
2.Select Operator (the projection)
1.expressions (the selected columns)
2.Statistics (statistics: row count, data size, etc.)
3.Filter Operator
1.predicate (the filter condition)
4.Map Join Operator (performs a map join)
5.File Output Operator (format of the output files)
1.compressed (compression)
2.Statistics (statistics)
3.table (the final output table/format)
6.Group By Operator (the grouping)
1.aggregations (the aggregations performed)
2.Keys (the group by keys)
3.mode (the aggregation mode)
7.Reduce Output Operator (output to the shuffle/reducers; where a combiner typically applies)
1.key expressions (the reduce keys)
2.value expressions (the reduce values)
3.sort order (ascending or descending)
4.Map-reduce partition columns (the partitioning keys)
8.PTF Operator (windowing operations)
1.partition by (the partitioning columns)
2.order by (the ordering columns)
3.window functions (the functions evaluated over the window)
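To see some of these operators in practice, an explain over a grouped query on the emp table from earlier (assumed to still exist) will show a Group By Operator and a Reduce Output Operator in the plan:
explain
select deptno, avg(sal) avg_sal
from emp
group by deptno;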
Hive optimizations
Column pruning: explicitly select only the columns you need.
Group By: not every aggregation has to be finished on the reduce side; many aggregations can be partially done on the map side first, with the reduce side producing the final result.
(1) whether to aggregate on the map side (default true)
set hive.map.aggr = true;
(2) number of entries processed for map-side aggregation
set hive.groupby.mapaggr.checkinterval = 100000;
(3) load-balance when the data is skewed (default false)
set hive.groupby.skewindata = true;
CBO optimization: since Hive 0.14.0 a cost-based optimizer can optimize HQL execution plans; it is controlled by hive.cbo.enable and is on by default since Hive 1.1.0. It can automatically reorder multiple joins in an HQL statement and choose suitable join algorithms.
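A minimal sketch of turning CBO on explicitly; the analyze statements gather the table and column statistics the optimizer relies on (emp is the table created earlier):
set hive.cbo.enable=true;
analyze table emp compute statistics;
analyze table emp compute statistics for columns;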
Predicate (filter) pushdown: push the where predicates as early in the plan as possible so that downstream operators handle less data. The corresponding logical optimizer is PredicatePushDown; the setting is hive.optimize.ppd, default true.
set hive.optimize.ppd = true; #predicate pushdown, default true
MapJoin: the smaller side of a join is distributed into the memory of each map task and the join is done there, skipping the reduce phase entirely and speeding things up.
(1) enable automatic MapJoin selection
set hive.auto.convert.join=true; #default true
(2) threshold below which a table counts as small (default 25 MB):
set hive.mapjoin.smalltable.filesize=25000000;
Large-table to large-table SMB Join (important): an ordinary join of two large tables is very slow; a sort-merge-bucket join helps.
It relies on bucketed tables.
set hive.optimize.bucketmapjoin = true;
set hive.optimize.bucketmapjoin.sortedmerge = true;
set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
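SMB join only works when both sides are bucketed (and sorted) on the join key with compatible bucket counts; a minimal sketch, with made-up table names, of what such tables could look like:
create table bigtable_buck1(
id bigint,
url string
)
clustered by (id) sorted by (id) into 6 buckets
row format delimited fields terminated by '\t';
create table bigtable_buck2 like bigtable_buck1;
---with the settings above, a join on the bucket column can run as a sort-merge-bucket map join
select count(*)
from bigtable_buck1 b1
join bigtable_buck2 b2 on b1.id = b2.id;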
Cartesian products: when a join has no on condition, or an invalid one, Hive cannot find a join key and has to compute the Cartesian product with a single reducer. When Hive runs in strict mode (hive.mapred.mode=strict rather than nonstrict), Cartesian products are not allowed in HQL. Beware of the data explosion such sql can cause.
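A minimal sketch of the guard mentioned above; with strict mode set, the earlier select * from emp join dept; (no on condition) would be rejected:
set hive.mapred.mode=strict; ---disallow Cartesian products, among other risky queries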
There are other optimizations not noted here; look them up as needed.
Advanced Hive tuning offers good ideas and is worth revisiting repeatedly.
Hive query practice 1
create database student;
use student;
create table student (
s_id string,
s_name string,
s_birth string,
s_sex string
)
row format delimited fields terminated by ',';
create table course (
c_id string,
c_name string,
t_id string
)
row format delimited fields terminated by ',';
create table teacher (
t_id string,
t_name string
)
row format delimited fields terminated by ',';
create table score (
s_id string,
c_id string,
s_score int
)
row format delimited fields terminated by ',';
----the above is data preparation; the data files are in the accompanying folder
---1. Students whose score in course "01" is higher than in course "02", with their info and both scores
with t1 as (select s_id,c_id,s_score from score where c_id="1"),
t2 as (select s_id,c_id,s_score from score where c_id="2")
select t1.s_id,
s.s_name,
s.s_birth,
s.s_sex,
t1.s_score,
t2.s_score
from t1 join t2 on t1.s_id=t2.s_id
join student s on t1.s_id = s.s_id
where t1.s_score>t2.s_score;
---to also display all of each student's scores
with t1 as ( select s_id,str_to_map(concat_ws(":",collect_list(concat(c_id,",",s_score))),":",",") scores
from score
group by s_id
)
select s.s_id,
s.s_name,
s.s_birth,
s.s_sex,
t1.scores['1'] class01,
t1.scores['2'] class02,
t1.scores['3'] class03
from t1 join student s
on t1.s_id = s.s_id
where t1.scores['1'] > t1.scores['2'];
---2. Students who have not taken all courses
select s.s_id,
s.s_name,
s.s_birth,
s.s_sex
from student s left join
---first: how many courses each student has taken
(select s.s_id
from score s
group by s.s_id
having count(s.c_id) = (select count(c_id) from course)
)t1 on s.s_id=t1.s_id
where t1.s_id is null; ---after the left join, rows where t1.s_id is null are the students who did not take every course
---3. Students who share at least one course with student "01"
---first: the courses taken by student 01
select distinct st.s_id,
st.s_name,
st.s_birth,
st.s_sex
from score s join
(
---the courses taken by student 01
select c_id
from score
where s_id = "1"
)t1
on s.c_id = t1.c_id
join student st on s.s_id=st.s_id
where s.s_id<>'1';
---4. Show every student's course scores and average score, ordered by average score descending
---simple way: compute the average with a window
select s_id,c_id,s_score,avg(s_score) over(partition by s_id) avg_score
from score
order by avg_score desc;
---a tidier version
with t1 as ( select s_id,str_to_map(concat_ws(":",collect_list(concat(c_id,",",s_score))),":",",") scores,
avg(s_score) avg_score
from score
group by s_id
)
select s_id,
t1.scores['1'] class01,
t1.scores['2'] class02,
t1.scores['3'] class03,
avg_score
from t1
order by avg_score desc;
---5. For each course: highest, lowest and average score, shown as: course id, course name, max, min, avg, pass rate, medium rate, good rate, excellent rate
--- pass: >=60, medium: 70-80, good: 80-90, excellent: >90
select c.c_id,
c.c_name,
max(s_score) max_score,
min(s_score) min_score,
avg(s_score) avg_score,
count(`if`(s_score>=60,1,null)) / count(1) pass_rate,
count(`if`(s_score>=70 and s_score <80,1,null)) / count(1) mid_rate,
count(`if`(s_score>=80 and s_score <90,1,null)) / count(1) good_rate,
count(`if`(s_score>=90,1,null)) / count(1) exce_rate
from score s
join course c on s.c_id = c.c_id
group by c.c_id,c.c_name;
---6. Rank the scores within each course
select c_id,s_id,s_score,rank() over (partition by c_id order by s_score desc) rnk
from score;
---7. Students ranked 2nd to 3rd in each course, with their info and course scores
select s.*,
t1.c_id,
t1.s_score
from (select c_id,s_id,s_score,rank() over (partition by c_id order by s_score desc) rnk
from score)t1
join student s on t1.s_id=s.s_id
where rnk between 2 and 3;
---think this one over
---8. Students with course scores above 70: name, course name and score (the phrasing is ambiguous, so both readings are shown)
select st.*,
s.s_id,
s.s_score
from (
---reading 1: every course score is at least 70
select s_id
from score
group by s_id
having min(s_score) >= 70
)t1 join score s on t1.s_id=s.s_id join student st on t1.s_id=st.s_id;
---reading 2: at least one course score is at least 70
select st.*,
s.s_id,
s.s_score
from (
---at least one course score is at least 70
select s_id
from score
group by s_id
having max(s_score) >= 70
)t1 join score s on t1.s_id=s.s_id join student st on t1.s_id=st.s_id;
---9. Students who have taken all courses:
select s.s_id,
s.s_name,
s.s_birth,
s.s_sex
from student s join
---first: how many courses each student has taken
(select s.s_id
from score s
group by s.s_id
having count(s.c_id) = (select count(c_id) from course)
)t1 on s.s_id=t1.s_id;
---10. Each student's age in full years
---the quick and dirty way
select s_id,`floor`(datediff(`current_date`(),s_birth) / 365) age
from student;
---the correct way: compare year, month and day to decide whether the birthday has already passed this year
select s_id,
`if`(
----has the birthday passed this year?
month(`current_date`()) > month(s_birth) or
(month(`current_date`()) = month(s_birth) and day(`current_date`())>day(s_birth)),
year(`current_date`()) - year(s_birth), ---age if the birthday has passed
year(`current_date`()) - year(s_birth) -1 ---age if it has not
) age
from student;
Hive query practice 2
create database cs;
use cs;
create external table gulivideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>
)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as textfile
location '/gulivideo/video';
select * from gulivideo_ori;
create external table gulivideo_user_ori(
uploader string,
videos int,
friends int
)
row format delimited
fields terminated by "\t"
stored as textfile
location '/gulivideo/user';
select * from gulivideo_user_ori;
create table gulivideo_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>
)
stored as orc
tblproperties("orc.compress"="SNAPPY");
create table gulivideo_user_orc(
uploader string,
videos int,
friends int
)
row format delimited
fields terminated by "\t"
stored as orc
tblproperties("orc.compress"="SNAPPY")
insert into table gulivideo_orc
select * from gulivideo_ori;
insert into table gulivideo_user_orc
select * from gulivideo_user_ori;
---Top 10 videos by view count
select videoId,views from gulivideo_orc order by views desc limit 10;
-- Top 10 categories by popularity (popularity = number of videos in the category)
-- (a plain select category,count(category) does not work here: category is an array and has to be exploded first)
-- 1. explode the category column
-- columns-to-rows with lateral view explode
-- 2. group by category, then order by the count
select
category_name,
count(t1.category_name) c1
from (select category_name
from gulivideo_orc
lateral view explode (category) category as category_name
)t1
group by category_name
order by c1 desc
limit 10;
-- For the 20 most-viewed videos: their categories, and how many of the Top 20 videos each category contains
--1. find the categories of the 20 most-viewed videos
--2. explode the categories
select t2.category_name,
count(*) category_count
from(
select videoid,category_name
from
(select
videoid,
category,
views
from gulivideo_orc
order by views desc
limit 20
)t1
lateral view explode(category) tmp as category_name
)t2
group by t2.category_name;
-- For the videos related to the Top 50 most-viewed videos, rank their categories (how many videos per category)
-- 1. the related videos (an array column) of the Top 50 most-viewed videos
select
relatedId,
views
from gulivideo_orc
order by views desc
limit 50; --t1
--2. explode the related videos
select
explode(relatedId) related_id
from t1; --t2
--3. join back to the original table to get the categories (an array) of the related videos
select
g.category
from t2
join gulivideo_orc g
on t2.related_id = g.videoId; --t3
--4. explode the category field
select
explode(category) category_name
from t3; --t4
--5. group by category, count, and order by the count
select
category_name,
count(*) c1
from
t4
group by category_name
order by c1 desc;
---final sql
select
category_name,
count(*) c1
from
(select
explode(category) category_name
from (select
g.category
from (select
explode(relatedId) related_id
from (select
relatedId,
views
from gulivideo_orc
order by views desc
limit 50)t1)t2
join gulivideo_orc g
on t2.related_id = g.videoId)t3)t4
group by category_name
order by c1 desc;
-- Top 10 most-viewed videos within one category, using Music as the example
select
t1.videoid,
t1.category_name,
t1.VIEWS
from
(
select
videoid,
category_name,
VIEWS
from gulivideo_orc
lateral view explode(category) tmp as category_name
) t1
where t1.category_name = "Music"
order by t1.views desc
limit 10;
-- Top 3 videos by view count in each category
--rank the videos within each category
select
t2.category_name,
t2.videoid,
t2.views,
t2.rk
from
(
select
t1.category_name,
t1.videoid,
t1.views,
rank() over(partition by t1.category_name order by t1.views desc) rk
from (
select
videoid,
category_name,
VIEWS
from gulivideo_orc
lateral view explode(category) tmp as category_name
)t1
) t2
where rk <=3;
-- For the Top 10 uploaders by number of uploaded videos, find their uploaded videos whose view counts rank in the top 20
select
t3.uploader,
t3.videoid,
t3.views,
t3.rk
from
(
select
t1.uploader,
t2.videoid,
t2.views,
rank() over(partition by t1.uploader order by t2.views desc) rk
from
(
select
uploader,
videos
from gulivideo_user_orc
order by videos desc
limit 10
) t1
join
gulivideo_orc t2
on t1.uploader = t2.uploader
) t3
where t3.rk <= 20;