一、基本概念

解决海量结构化日志的数据统计
基于Hadoop的数据仓库工具，可以将结构化的数据文件映射为一张表
本质：将HQL转化成MapReduce程序
Hive处理的数据存储在HDFS上，默认实现MapReduce（可以替换成spark），执行程序在Yarn上
可以理解为，hive是hadoop的客户端
架构：
- 客户端：CLI、JDBC、Driver
- 元数据Meta store：包括表名、表所属的数据库、表拥有者等
- 驱动器：SQL Parser解析器 -> Physical Plan编译器 -> Query Optimizer优化器 -> Execution执行器
- Hadoop：MapReduce，HDFS

二、安装

这里强烈建议和尚老师的版本一样，顺便把原来的hadoop环境改成了2.7.2。。。。
不一定要把hive分发到各个机器上
自己配版本很容易会有一堆jar包冲突的问题。

# 在software文件下解压
tar -zxvf apache-hive-1.2.1-bin.tar.gz -C /opt/module
# 解压mysql
unzip mysql-libs.zip
# 给hive改个名字，因为太长了
mv apache-hive-1.2.1-bin/ hive
# 配置hadoop路径，在hive/conf里面
mv hive-env.sh.template hive-env.sh
sudo vim hive-env.sh
export JAVA_HOME=/opt/module/jdk1.8.0_151
export HIVE_HOME=/opt/module/hive
export HADOOP_HOME=/opt/module/hadoop-3.1.3
export HIVE_CONF_DIR=/opt/module/hive/conf
# 为啥我启动后没有derby.log和metastore_db

启动hive

1 2	# 在hive的文件夹里面 bin/hive

配置mysql，需要切换到root用户。原来的derby只允许一个用户接入

# 查询是否有安装mysql，有的话就删掉
rpm -qa | grep -i mysql
# 第二种方法就是
fing / -name mysql
# 安装mysql
rpm -e --nodeps xxx
# centos8要加--nodeps
rpm -ivh MySQL-client-5.6.24-1.el6.x86_64.rpm --nodeps
# mysql状态
service mysql status
# mysql启动
service mysql start

这里我启动mysql时报错：Starting MySQL. ERROR! The server quit without updating PID file

# 在/etc/init.d的mysql文件夹中，添加 --user=root，就ok了
case "$mode" in
  'start')
    # Start daemon

    # Safeguard (relative paths, core dumps..)
    cd $basedir

    echo $echo_n "Starting MySQL"
    if test -x $bindir/mysqld_safe
    then
      # Give extra arguments to mysqld with the my.cnf file. This script
      # may be overwritten at next upgrade.
      # 在
      # 这
      # 里
      $bindir/mysqld_safe --user=root --datadir="$datadir" --pid-file="$mysqld_pid_file_path" $other_args >/dev/null 2>&1 &

打开mysql时出错，mysql: error while loading shared libraries: libncurses.so.5: cannot open shared object file: No such file or directory，输入yum install libncurses*解决，参考风的低吟
启动成功后

# 登录，密码在/etc/.mysql_serect里面
mysql -uroot -pXXXX
# 修改密码
SET PASSWORD=PASSWORD('123');
# 修改user表
2．显示数据库
mysql>show databases;
3．使用mysql数据库
mysql>use mysql;
4．展示mysql数据库中的所有表
mysql>show tables;
5．展示user表的结构
mysql>desc user;
6．查询user表
mysql>select User, Host, Password from user;
7．修改user表，把Host表内容修改为%
mysql>update user set host='%' where host='localhost';
8．删除root用户的其他host
mysql>delete from user where Host='hadoop101';
mysql>delete from user where Host='127.0.0.1';
mysql>delete from user where Host='::1';
# 解压connector的tar包后
[root@hadoop101 mysql-connector-java-5.1.27]# cp mysql-connector-java-5.1.27-bin.jar /opt/module/hive/lib/
# 在/opt/module/hive/conf/下创建xml文件
touch hive-site.xml
# 添加以下内容
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
	<property>
	  <name>javax.jdo.option.ConnectionURL</name>
	  <value>jdbc:mysql://hadoop101:3306/metastore?createDatabaseIfNotExist=true</value>
	  <description>JDBC connect string for a JDBC metastore</description>
	</property>

	<property>
	  <name>javax.jdo.option.ConnectionDriverName</name>
	  <value>com.mysql.jdbc.Driver</value>
	  <description>Driver class name for a JDBC metastore</description>
	</property>

	<property>
	  <name>javax.jdo.option.ConnectionUserName</name>
	  <value>root</value>
	  <description>username to use against metastore database</description>
	</property>

	<property>
	  <name>javax.jdo.option.ConnectionPassword</name>
	  <value>123</value>
	  <description>password to use against metastore database</description>
	</property>
</configuration>

三、学习笔记

load data local inpath '/opt/module/data/stu.txt' into table stu;
# 其实是等价于下面这条
hadoop fs -put stu.txt /user/hive/warehouse/stu
# 注意，hadoop put是复制一份
# hive的load是将hdfs的路径改了
load data inpath 'xxxx' into table xxx;

对hive参数的修改：

# 命令行修改参数：这种修改方式只对本次启动的hive生效
bin/hive -hiveconf mapred.reduce.tasks=10;
# 也可以在HQL里面改
set mapred.reduce.tasks=100;
# 查看参数
set mapred.reduce.tasks;

hive三种复杂的数据类型：

STRUCT	MAP	ARRAY
与C语言中的Struct相似	`<string, int>`，KV对	相同类型的数组

# 创建一个test表格
create table test(
name string,
friends array<string>,
children map<string, int>,
address struct<street:string, city:string>
)
row format delimited fields terminated by ','
collection items terminated by '_'
map keys terminated by ':'
lines terminated by '\n';
# 插入数据
select friends[1],children['xxx'],address.city from test;

DDL数据定义，这里和Mysql语法相似

# 创建数据库，还可以在后面指定数据库位置：location '/xxx.db';
create database if not exists xxx;
# 显示数据库详细信息
desc database extended xxx;
# 删除，cascade是强制删除
drop database if exists xxx [cascade];

外部表和内部表：前者，删除时不会删实际数据，只删元数据；

# 创建表
create [external] table if not exists xxx;
# 修改表为外部表，内部表为FALSE，一定要大写
alter table xxx set tblproperties('EXTERNAL'='TRUE');

分区表：就是个文件夹，原来的表是一个文件夹，现在将一张表分到多个文件夹里面

# 创建分区表，按month分区
create table dept_partition(xxx) partitioned by (month string)
# 单个分区查询
select * from dept_partition where month='xxx';
# 多个分区查询
select * from dept_partition where month='xxx'
union select * from dept_partition where month='xxx'
union select * from dept_partition where month='xxx';
# 增加多个分区
alter table dept_partition add[drop] partition(month='xxx') partition(month='yyy');
# 查看有多少个分区
show partitions dept_partition
# 二级分区，可以分多个区嵌套
create table dept_partition2() partitioned by (month string, day string)
# 上传数据的方式
# 先上传数据后修复元数据
dfs -put /xxx /xxx;
msck repair table dept_partition2;
# 上传后添加分区
dfs -mkdir -p /xxx;
dfs -put /xxx /xxx;
alter table dept_partition2 add partition(month='', day='');
# 创建文件夹后load到分区
dfs -mkdir -p
load data local inpath '' into table dept_partition2 partition(month='',day='');

修改表

# 修改表名
alter table dept_partition2 rename to dept_partition3
# 增加列
ALTER TABLE table_name CHANGE [COLUMN] col_old_name col_new_name column_type [COMMENT col_comment] [FIRST|AFTER column_name]

DML数据操作

# 加载数据：load，overwrite是覆盖已有数据
load data [local] inpath '' [overwrite] into table xxx [partition (x=y,...)]
# 插入：insert
insert into table xxx partition(month='xx') values(xxx, yyy)
# as select会将select出来的数据放到新表student3中
create table if not exists student3 as select id, name from student;
# create表时可以指定位置
create table if not exists student() location '/xxx';
# 注意import数据一定要先export
# insert也可以导出数据
insert overwrite [local] directory '/xxx'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 
select * from student;
# hadoop导出
dfs -get /xxx /xxx;
# shell导出
bin/hive -e 'select * from database.table' > /xxx;
# export到hdfs上
export table database.table to '/xxx';
# 还有sqoop导出
# 清除表中数据，truncate只删管理表，不删外部表
truncate table xxx;

查询，这个可以查官方文档

# select语法，select_expr后面可以加：AS 别名
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
  FROM table_reference
  [WHERE where_condition]
  [GROUP BY col_list]
  [ORDER BY col_list]
  [CLUSTER BY col_list
    | [DISTRIBUTE BY col_list] [SORT BY col_list]
  ]
 [LIMIT number]

比较运算符

# 否定前面可以加NOT
Between B and C
IS NULL
IN(a,b,c)
# 简单的匹配，B是简单正则表达式：x%、%x、%x%、x_、_x、_x_
A LIKE B
# 复杂正则表达式
A RLIKE B
A REGEXP B

逻辑运算符

1
2
3

AND 
OR
NOT

分组

# group by，按照group by的内容分别select
select * table from emp t group by t.xxx;
# having与where相似，但前者可以使用分组函数，后者不能
# having针对查询结果中的列发挥作用，where对表中的列发挥作用
# having只用于group by
# 下面的语句，having只对avg_sal起作用
select avg(sal) avg_sal from emp group by deptno having avg_sal > 2000;

连接
排序

# Order by：全局排序，只有一个Reducer
select * from emp order by sal [desc];

# Sort by：对每个Reducer内的数据进行排序，不一定是全局排序
# 下面设置三个Reducer，在reducer里面排序
set mapreduce.job.reduces=3;
select * from emp sort by empno desc;

# Distribute by：先进行分区，在区内排序，这个和sort by不一样
# 一般结合sort by使用
set mapreduce.job.reduces=3;
select * from emp distribute by deptno sort by empno desc;

# cluster by：distribute by和sorts by字段相同的情况
# cluster by只是升序，不能降序

分桶：分区针对的是数据的存储路径；分桶针对的是数据文件。

# 这里根据id分成了四个桶，分区partition的关键词month、day不在表里
# 设置分桶时，要设置属性
set hive.enforce.bucketing=true;
set mapreduce.job.reduces=-1;
create table stu_buck(id int, name string) clustered by(id) into 4 buckets
row format delimited fields terminated by '\t';
# 分桶抽样：y是bucket数量的倍数或因子，根据y决定抽样比例
# x表示从哪个bucket开始抽样
select * from stu_buck tablesample(bucket x out of y on id);

常用查询函数

# 空字段赋值：将为NULL的string替换成replace_with
nvl(string, replace_with)
# case when：这个和decode一样吧
case sex when '男' then 1 else 0 end 
decode(col_name, k1, v1, k2, v2...)

行转列、列转行

# 行转列
concat：拼接，concat('a','b','c')的结果是'abc'
concat_ws：拼接，concat('-','a','b','c')的结果是'a-b-c'
concat_set：拼接，类似unique()
# 列转行
explode：展开，把一列展开成多多个行，比如a列如下
a
x,x
y,y
z,z
explode(a)的结果是：
x
x
y
y
z
z
# lateral view：侧写，相当于建立一个虚拟表，与udtf函数一起使用
# lateral view可以多个一起使用，这里的myTable1、myTable2是虚拟表的名称
SELECT myCol1, myCol2 FROM baseTable
    LATERAL VIEW explode(col1) myTable1 AS myCol1
    LATERAL VIEW explode(col2) myTable2 AS myCol2;

窗口函数

over函数对每条数据都开窗口

# 以一个列子来说明
jack,2017-01-01,10
tony,2017-01-02,15
jack,2017-02-03,23
tony,2017-01-04,29
jack,2017-01-05,46
jack,2017-04-06,42
tony,2017-01-07,50
jack,2017-01-08,55
mart,2017-04-08,62
mart,2017-04-09,68
neil,2017-05-10,12
mart,2017-04-11,75
neil,2017-06-12,80
mart,2017-04-13,94

# 不加over的情况下：
select name,count(*) from business
where substring(orderdate, 1, 7)='2017-04' group by name;
jack	1
mart	4
# 加了over()的情况下：这个over相当于
# 先select * from business where xxxxx group by name;
# 看当前有两行，over对这两行进行计算count
select name,count(*) over () from business 
where substring(orderdate,1,7) = '2017-04' group by name;
mart	2
jack	2

# 查询顾客的 购买明细 及 购买总额
select *,sum(cost) over() from business;

# over()里面group by只能用于第一条数据到现在的数据，这一种情况
select *,sum(cost) over(distribute by name sort by orderdate) from business;

# 查询顾客的 购买明细 及 月购买总额
select *,sum(cost) over(partition by month(orderdate)) 
from business;

排名函数

# rank

函数

# 内置函数
hive> show functions;
# 显示函数的用法
hive> desc function extended xxxx