Environment Deployment

Continuously updated since September 19, 2019; currently covers the Hadoop 3 series.

0.集群规划

HostName IP CPU RAM DISK OS ENV DIR
hadoop161 192.168.8.161 4 8G 40G CentOS 7.6 /opt/module/
hadoop162 192.168.8.162 4 8G 40G CentOS 7.6 /opt/module/
hadoop163 192.168.8.163 4 8G 40G CentOS 7.6 /opt/module/

All machines have SELinux disabled, firewalld disabled and chronyd time synchronization enabled; environment variables are configured centrally under the home directory: .bashrc sources /opt/module/env.sh.

See the rationale for this setup: https://forsre.cn/pages/linux%E7%8E%AF%E5%A2%83%E5%8F%98%E9%87%8F/
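A minimal per-node preparation sketch, assuming CentOS 7 with sudo access and the forsre user used later in this guide (hedged; adjust to your environment):

sudo setenforce 0
sudo sed -i 's/^SELINUX=enforcing/SELINUX=disabled/' /etc/selinux/config
sudo systemctl disable --now firewalld
sudo systemctl enable --now chronyd
chronyc tracking    # confirm the clock is synchronized
sudo mkdir -p /opt/module && sudo chown forsre:forsre /opt/module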

1.JDK

1.1 JDK 下载解压

wget https://repo.huaweicloud.com/java/jdk/8u202-b08/jdk-8u202-linux-x64.tar.gz
#md5sum jdk-8u202-linux-x64.tar.gz 
#0029351f7a946f6c05b582100c7d45b7  jdk-8u202-linux-x64.tar.gz
tar zxvf jdk-8u202-linux-x64.tar.gz -C /opt/module/

1.2 环境变量

tee /opt/module/env.sh <<-'EOF'
#!/bin/bash

#JAVA_HOME
export JAVA_HOME=/opt/module/jdk1.8.0_202
export PATH=$PATH:$JAVA_HOME/bin
EOF

1.3 分发脚本

#bin/xsync 
#!/bin/bash

#1. 判断参数个数
if [ $# -lt 1 ]
then
  echo Not Enough Arguments!
  exit;
fi
#2. 遍历集群所有机器
for host in hadoop162 hadoop163
do
  echo ====================  $host  ====================
  #3. 遍历所有目录,挨个发送
  for file in $@
  do
    #4 判断文件是否存在
    if [ -e $file ]
    then
      #5. 获取父目录
      pdir=$(cd -P $(dirname $file); pwd)
      #6. 获取当前文件的名称
      fname=$(basename $file)
      ssh $host "mkdir -p $pdir"
      rsync -av $pdir/$fname $host:$pdir
    else
      echo $file does not exist!
    fi
  done
done
#bin/xscp 
#!/bin/bash

#1. 判断参数个数
if [ $# -lt 1 ]
then
  echo Not Enough Arguments!
  exit;
fi
#2. 遍历集群所有机器
for host in hadoop162 hadoop163
do
  echo ====================  $host  ====================
  #3. 遍历所有目录,挨个发送
  for file in $@
  do
    #4 判断文件是否存在
    if [ -e $file ]
    then
      #5. 获取父目录
      pdir=$(cd -P $(dirname $file); pwd)
      #6. 获取当前文件的名称
      fname=$(basename $file)
      ssh $host "mkdir -p $pdir"
      scp -r $pdir/$fname $host:$pdir
    else
      echo $file does not exist!
    fi
  done
done
#xcall 
#!/bin/bash

for i in hadoop161 hadoop162 hadoop163;do
	echo -e "\033[32m ========== $i ========== \033[0m"
	ssh $i "$*"
done

1.4 同步JDK环境变量

chmod +x /opt/module/env.sh
echo "source /opt/module/env.sh" >> .bashrc
xsync /opt/module/jdk1.8.0_202/
xsync /opt/module/env.sh
xsync .bashrc
#or xscp 
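After distributing, a quick sanity check across all three nodes (a sketch relying on the xcall helper and passwordless SSH from 1.3):

xcall "source /opt/module/env.sh; java -version"
# every host should report: java version "1.8.0_202"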

2.ZK

2.1 下载解压

wget http://archive.apache.org/dist/zookeeper/zookeeper-3.5.7/apache-zookeeper-3.5.7-bin.tar.gz
#md5sum apache-zookeeper-3.5.7-bin.tar.gz 
#69e4fe7bbea87d0c97e04ecb725bdc12  apache-zookeeper-3.5.7-bin.tar.gz
tar zxvf apache-zookeeper-3.5.7-bin.tar.gz -C /opt/module/
mv /opt/module/apache-zookeeper-3.5.7-bin /opt/module/zookeeper-3.5.7

2.2 环境变量

/opt/module/env.sh

#ZOOKEEPER_HOME
export ZOOKEEPER_HOME=/opt/module/zookeeper-3.5.7
export PATH=$PATH:$ZOOKEEPER_HOME/bin

2.3 配置文件

mkdir /opt/module/zookeeper-3.5.7/data

tee /opt/module/zookeeper-3.5.7/data/myid <<-'EOF'
161
EOF

#touch /opt/module/zookeeper-3.5.7/data/myid
#echo 161 > /opt/module/zookeeper-3.5.7/data/myid
mv /opt/module/zookeeper-3.5.7/conf/zoo_sample.cfg /opt/module/zookeeper-3.5.7/conf/zoo.cfg

#vim /opt/module/zookeeper-3.5.7/conf/zoo.cfg

dataDir=/opt/module/zookeeper-3.5.7/data
server.161=hadoop161:2888:3888
server.162=hadoop162:2888:3888
server.163=hadoop163:2888:3888


#Server.A=B:C:D。
#A是一个数字,表示这个是第几号服务器;
#B是这个服务器的IP地址;
#C是这个服务器与集群中的Leader服务器交换信息的端口;
#D是万一集群中的Leader服务器挂了,需要一个端口来重新进行选举,选出一个新的Leader,而这个端口就是用来执行选举时服务器相互通信的端口。

2.4 分发同步

xsync /opt/module/zookeeper-3.5.7
xsync /opt/module/env.sh

2.5 修改配置

#修改另外两台的机器myid为162和163
/opt/module/zookeeper-3.5.7/data/myid

2.6 群起脚本

2.6.1 zk.sh

#!/bin/bash

if (($#==0))
then
    echo -e "\033[31m start, stop or status \033[0m"
    exit 1;
fi
for i in hadoop161 hadoop162 hadoop163
do
    echo  -e "\033[32m =====================  $1ing zk in $i  =======================\033[0m"
    ssh $i "/opt/module/zookeeper-3.5.7/bin/zkServer.sh $1" 2> /dev/null
done
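Usage sketch (assuming zk.sh is placed in ~/bin, which is on the PATH):

zk.sh start
zk.sh status    # each node should report Mode: leader or follower
zk.sh stop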

2.6.2 systemd

#sudo vim /etc/systemd/system/zookeeper.service 
[Unit]
Description=ZooKeeper Service
Documentation=http://zookeeper.apache.org
Requires=network.target
After=network.target

[Service]
Type=forking
User=forsre
Group=forsre
Environment=JAVA_HOME=/opt/module/jdk1.8.0_202
ExecStart=/opt/module/zookeeper-3.5.7/bin/zkServer.sh start /opt/module/zookeeper-3.5.7/conf/zoo.cfg
ExecStop=/opt/module/zookeeper-3.5.7/bin/zkServer.sh stop /opt/module/zookeeper-3.5.7/conf/zoo.cfg
ExecReload=/opt/module/zookeeper-3.5.7/bin/zkServer.sh restart /opt/module/zookeeper-3.5.7/conf/zoo.cfg

[Install]
WantedBy=default.target

sudo systemctl enable zookeeper
sudo systemctl start zookeeper
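The unit file must exist on every node before systemctl can manage ZooKeeper there; a hedged distribution sketch (the same sudo-capable user on all hosts is assumed):

for host in hadoop162 hadoop163; do
  scp /etc/systemd/system/zookeeper.service $host:/tmp/
  ssh -t $host "sudo mv /tmp/zookeeper.service /etc/systemd/system/ && sudo systemctl daemon-reload && sudo systemctl enable --now zookeeper"
done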

3. Hadoop

Host hadoop161 hadoop162 hadoop163
HDFS NameNode,DataNode DataNode SecondaryNameNode,DataNode
YARN NodeManager ResourceManager,NodeManager NodeManager

3.1 下载解压

wget https://archive.apache.org/dist/hadoop/common/hadoop-3.1.3/hadoop-3.1.3.tar.gz
#md5sum hadoop-3.1.3.tar.gz 
#17814b7f952415dc0b3b6d7c3554bae5  hadoop-3.1.3.tar.gz
tar -zxvf hadoop-3.1.3.tar.gz -C /opt/module/

3.2 环境变量

/opt/module/env.sh
#HADOOP_HOME
export HADOOP_HOME=/opt/module/hadoop-3.1.3
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin

3.3 配置文件

3.3.1 core-site.xml

ref

<configuration>

<!-- 指定NameNode的地址 -->
<property>
        <name>fs.defaultFS</name>
        <value>hdfs://hadoop161:8020</value>
</property>
<!-- 指定hadoop数据的存储目录 -->
<property>
        <name>hadoop.tmp.dir</name>
        <value>/opt/module/hadoop-3.1.3/data</value>
</property>

<!-- 配置HDFS网页登录使用的静态用户为forsre -->
<property>
        <name>hadoop.http.staticuser.user</name>
        <value>forsre</value>
</property>

<!-- 配置该forsre(superUser)允许通过代理访问的主机节点 -->
<property>
        <name>hadoop.proxyuser.forsre.hosts</name>
        <value>*</value>
</property>
<!-- 配置该forsre(superUser)允许通过代理用户所属组 -->
<property>
        <name>hadoop.proxyuser.forsre.groups</name>
        <value>*</value>
</property>
<!-- 配置该forsre(superUser)允许通过代理的用户-->
<property>
        <name>hadoop.proxyuser.forsre.users</name>
        <value>*</value>
</property>
    
</configuration>

3.3.2 hdfs-site.xml

ref

<configuration>
    
	<!-- nn web端访问地址-->
    <property>
        <name>dfs.namenode.http-address</name>
        <value>hadoop161:9870</value>
    </property>
    
	<!-- 2nn web端访问地址-->
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>hadoop163:9868</value>
    </property>
    
    <!-- HDFS replica count -->
    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property>
    
</configuration>

3.3.3 yarn-site.xml

ref

<configuration>
    
	<!-- 指定MR走shuffle -->
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    
    <!-- 指定ResourceManager的地址-->
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>hadoop162</value>
    </property>
    
    <!-- 环境变量的继承 -->
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
    </property>
    
    <!-- yarn容器允许分配的最大最小内存 -->
    <property>
        <name>yarn.scheduler.minimum-allocation-mb</name>
        <value>1024</value>
    </property>
    <property>
        <name>yarn.scheduler.maximum-allocation-mb</name>
        <value>8192</value>
    </property>
    
    <!-- yarn容器允许管理的物理内存大小 -->
    <property>
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>8192</value>
    </property>
    
    <!-- 关闭yarn对物理内存和虚拟内存的限制检查 -->
    <property>
        <name>yarn.nodemanager.pmem-check-enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>yarn.nodemanager.vmem-check-enabled</name>
        <value>false</value>
    </property>


<!-- 开启日志聚集功能 -->
<property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
</property>

<!-- 设置日志聚集服务器地址 -->
<property>  
    <name>yarn.log.server.url</name>  
    <value>http://hadoop161:19888/jobhistory/logs</value>
</property>

<!-- 设置日志保留时间为7天 -->
<property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>604800</value>
</property>

</configuration>

3.3.4 mapred-site.xml

ref

<configuration>
    
	<!-- 指定MapReduce程序运行在Yarn上 -->
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>

<!-- 历史服务器端地址 -->
<property>
    <name>mapreduce.jobhistory.address</name>
    <value>hadoop161:10020</value>
</property>

<!-- 历史服务器web端地址 -->
<property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>hadoop161:19888</value>
</property>
    
</configuration>

3.3.5 workers

hadoop161
hadoop162
hadoop163

3.3.6 分发同步

xsync /opt/module/hadoop-3.1.3
xsync /opt/module/env.sh

3.3.7 lzo

3.3.7.1 编译安装
#1 安装依赖 yum安装的lzo版本是2.06
yum -y install lzo lzop lzo-devel gcc gcc-c++ zlib-devel autoconf automake libtool
# lzo源码编译如下 lzo版本是2.10
#yum -y install gcc-c++ lzo-devel zlib-devel autoconf automake libtool
#wget http://www.oberhumer.com/opensource/lzo/download/lzo-2.10.tar.gz
#tar -zxvf lzo-2.10.tar.gz
#cd lzo-2.10
#./configure -prefix=/usr/local/hadoop/lzo/
#make
#make install
# 下载源码
git clone https://github.com/twitter/hadoop-lzo.git
#修改pom.xml
#0.4.21-SNAPSHOT to 0.4.21
#hadoop version to 3.1.3
#3 编译
# lzo源码编译需要export两个环境变量
#export C_INCLUDE_PATH=/usr/local/hadoop/lzo/include
#export LIBRARY_PATH=/usr/local/hadoop/lzo/lib 
mvn clean package -Dmaven.test.skip=true -Dmaven.javadoc.skip=true
3.3.7.2 测试
#1 Copy the built hadoop-lzo-0.4.21.jar into hadoop-3.1.3/share/hadoop/common/
#2 distribute it
xsync hadoop-lzo-0.4.21.jar
#3 add LZO compression support in core-site.xml
<configuration>
    <property>
        <name>io.compression.codecs</name>
        <value>
            org.apache.hadoop.io.compress.GzipCodec,
            org.apache.hadoop.io.compress.DefaultCodec,
            org.apache.hadoop.io.compress.BZip2Codec,
            org.apache.hadoop.io.compress.SnappyCodec,
            com.hadoop.compression.lzo.LzoCodec,
            com.hadoop.compression.lzo.LzopCodec
        </value>
    </property>

    <property>
        <name>io.compression.codec.lzo.class</name>
        <value>com.hadoop.compression.lzo.LzoCodec</value>
    </property>
</configuration>
#4
xsync core-site.xml

#5 test
hadoop fs -mkdir /input
hadoop fs -put README.txt /input
hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.3.jar wordcount \
-Dmapreduce.output.fileoutputformat.compress=true \
-Dmapreduce.output.fileoutputformat.compress.codec=com.hadoop.compression.lzo.LzopCodec  /input /output

#6 index
hadoop fs -mkdir /input
hadoop fs -put bigtable.lzo /input
#split 1
hadoop jar /opt/module/hadoop-3.1.3/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.3.jar wordcount -Dmapreduce.job.inputformat.class=com.hadoop.mapreduce.LzoTextInputFormat /input /output1

hadoop jar /opt/module/hadoop-3.1.3/share/hadoop/common/hadoop-lzo-0.4.21.jar com.hadoop.compression.lzo.DistributedLzoIndexer /input/bigtable.lzo
#split2
hadoop jar /opt/module/hadoop-3.1.3/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.3.jar wordcount -Dmapreduce.job.inputformat.class=com.hadoop.mapreduce.LzoTextInputFormat /input /output2

3.4 启动集群

#on hadoop161:

#格式化NameNode
bin/hdfs namenode -format

#启动HDFS
sbin/start-dfs.sh

#on hadoop162:
#启动YARN
sbin/start-yarn.sh

#HDFS的Web页面:
http://hadoop161:9870/

#SecondaryNameNode
http://hadoop163:9868/status.html

#The 2NN web page does not show full information
#1) Symptom
#   Visiting the 2NN page http://hadoop163:9868 shows no detailed information
#2) Fix
#(1) Press F12 in the browser to find the cause; the bug is at line 61 of dfs-dust.js
#(2) Locate the file to modify
#pwd
/opt/module/hadoop-3.1.3/share/hadoop/hdfs/webapps/static/dfs-dust.js
#vim dfs-dust.js
:set nu
#change line 61 to
return new Date(Number(v)).toLocaleString();
#distribute dfs-dust.js
# xsync dfs-dust.js
# then force-refresh http://hadoop163:9868/status.html

3.5 群起脚本 hd.sh

#!/bin/bash
if [ $# -lt 1 ]
then
   echo -e "\033[31m start, stop or status \033[0m"
    exit 1;
fi
case $1 in
"start")
	   echo -e "\033[32m =================== 启动 hadoop集群 ===================\033[0m"
        echo  -e "\033[32m  --------------- 启动 hdfs --------------- \033[0m"
        ssh hadoop161 "/opt/module/hadoop-3.1.3/sbin/start-dfs.sh"
        echo -e "\033[32m --------------- 启动 yarn ---------------\033[0m"
        ssh hadoop162 "/opt/module/hadoop-3.1.3/sbin/start-yarn.sh"
        echo -e  "\033[32m --------------- 启动 historyserver ---------------\033[0m"
        ssh hadoop161 "/opt/module/hadoop-3.1.3/bin/mapred --daemon start historyserver"
;;
"stop")
        echo -e  "\033[31m  =================== 关闭 hadoop集群 ===================\033[0m"

        echo -e " \033[31m --------------- 关闭 historyserver ---------------\033[0m"
        ssh hadoop161 "/opt/module/hadoop-3.1.3/bin/mapred --daemon stop historyserver"
        echo -e "\033[31m  --------------- 关闭 yarn ---------------\033[0m"
        ssh hadoop162 "/opt/module/hadoop-3.1.3/sbin/stop-yarn.sh"
        echo -e "\033[31m  --------------- 关闭 hdfs ---------------\033[0m"
        ssh hadoop161 "/opt/module/hadoop-3.1.3/sbin/stop-dfs.sh"
;;
"status")
	/bin/bash /home/forsre/bin/jpsall
;;
*)
    echo -e "\033[31m start, stop or status \033[0m"
;;
esac

3.6 jpsall工具

#!/bin/bash

for i in hadoop161 hadoop162 hadoop163
do
    echo -e "\033[32m =====================  jps on $i  =======================\033[0m"
    ssh $i "jps | grep -v Jps" 2> /dev/null
done

3.7 Hadoop HA (optional; uses more resources, use with caution in a test environment)

Role layout (ResourceManager placement follows rm1/rm2 in yarn-site.xml below):
hadoop161: NameNode, ZKFC, JournalNode, DataNode, ZK, ResourceManager, NodeManager
hadoop162: NameNode, ZKFC, JournalNode, DataNode, ZK, NodeManager
hadoop163: NameNode, ZKFC, JournalNode, DataNode, ZK, ResourceManager, NodeManager

How HA works

1. Metadata management changes: each NameNode keeps a copy of the metadata in memory; only the Active NameNode may write to the edits log; all NameNodes can read the edits; the shared edits live in shared storage (qjournal/QJM and NFS are the two mainstream implementations). 2. Passwordless SSH must be possible between the NameNodes. 3. Fencing: only one NameNode serves clients at any given moment.

3.7.0 安装依赖

sudo yum install -y psmisc
#psmisc provides fuser, killall and pstree
#zkfc log:
#PATH=$PATH:/sbin:/usr/sbin fuser -v -k -n tcp 8020 via ssh: bash: fuser: command not found
#dfs.ha.fencing.methods
#sshfence: connect to the active NameNode and kill the process.
#shell: run a shell command to fence the active NameNode.

3.7.1 解压部署

wget https://archive.apache.org/dist/hadoop/common/hadoop-3.1.3/hadoop-3.1.3.tar.gz
#md5sum hadoop-3.1.3.tar.gz 
#17814b7f952415dc0b3b6d7c3554bae5  hadoop-3.1.3.tar.gz
mkdir /opt/module/hadoop-ha
tar -zxvf hadoop-3.1.3.tar.gz -C /opt/module/hadoop-ha

3.7.2 环境变量

/opt/module/env.sh

#HADOOP_HA_HOME
export HADOOP_HOME=/opt/module/hadoop-ha/hadoop-3.1.3
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin

3.7.3 core-site.xml

ref

<configuration>
<!-- 把多个NameNode的地址组装成一个集群mycluster -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://mycluster</value>
    </property>	
<!-- 指定hadoop运行时产生文件的存储目录 -->
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/opt/module/hadoop-ha/hadoop-3.1.3/data/ha</value>
    </property>
<!-- zk -->
    <property>
        <name>ha.zookeeper.quorum</name>
        <value>hadoop161:2181,hadoop162:2181,hadoop163:2181</value>
</property>
    
 <!-- -->   
 <property>
   <name>ipc.client.connect.max.retries</name>
    <value>20</value>
    <description>
      Indicates the number of retries a clientwill make to establisha server connection.
    </description>
  </property>
 
  <property>
   <name>ipc.client.connect.retry.interval</name>
    <value>5000</value>
    <description>
      Indicates the number of milliseconds aclient will wait for before retrying to establish a server connection.
    </description>
  </property>

</configuration>

3.7.4 hdfs-site.xml

ref

<configuration>
	<!-- 完全分布式集群名称 -->
	<property>
		<name>dfs.nameservices</name>
		<value>mycluster</value>
	</property>
  <!-- NameNode数据存储目录 -->
  <property>
        <name>dfs.namenode.name.dir</name>
        <value>${hadoop.tmp.dir}/name</value>
  </property>
 <!-- DataNode数据存储目录 -->
  <property>
        <name>dfs.datanode.data.dir</name>
        <value>${hadoop.tmp.dir}/data</value>
  </property>
  <!--Journalnode数据存储目录-->
	<property>
		<name>dfs.journalnode.edits.dir</name>
		<value>${hadoop.tmp.dir}/jn</value>
	</property> 
	<!-- 集群中NameNode节点都有哪些 -->
	<property>
		<name>dfs.ha.namenodes.mycluster</name>
		<value>nn1,nn2,nn3</value>
	</property>
	<!-- nn1的RPC通信地址 -->
	<property>
		<name>dfs.namenode.rpc-address.mycluster.nn1</name>
		<value>hadoop161:8020</value>
	</property>
	<!-- nn2的RPC通信地址 -->
	<property>
		<name>dfs.namenode.rpc-address.mycluster.nn2</name>
		<value>hadoop162:8020</value>
	</property>
	<!-- nn3的RPC通信地址 -->
	<property>
		<name>dfs.namenode.rpc-address.mycluster.nn3</name>
		<value>hadoop163:8020</value>
	</property>
	<!-- nn1的http通信地址 -->
	<property>
		<name>dfs.namenode.http-address.mycluster.nn1</name>
		<value>hadoop161:9870</value>
	</property>
	<!-- nn2的http通信地址 -->
	<property>
		<name>dfs.namenode.http-address.mycluster.nn2</name>
		<value>hadoop162:9870</value>
	</property>
	<!-- nn3的http通信地址 -->
	<property>
		<name>dfs.namenode.http-address.mycluster.nn3</name>
		<value>hadoop163:9870</value>
	</property>
	<!-- 指定NameNode元数据在JournalNode上的存放位置 -->
	<property>
		<name>dfs.namenode.shared.edits.dir</name>
	<value>qjournal://hadoop161:8485;hadoop162:8485;hadoop163:8485/mycluster</value>
	</property>
	<!-- 配置隔离机制,即同一时刻只能有一台服务器对外响应 -->
	<property>
		<name>dfs.ha.fencing.methods</name>
		<value>sshfence</value>
	</property>
	<!-- 使用隔离机制时需要ssh无秘钥登录-->
	<property>
		<name>dfs.ha.fencing.ssh.private-key-files</name>
		<value>/home/forsre/.ssh/id_rsa</value>
	</property>
	<!-- 访问代理类:client用于确定哪个NameNode为Active -->
	<property>		<name>dfs.client.failover.proxy.provider.mycluster</name>
	<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
     </property>
        
    <!--配置故障自动转义-->
     <property>
       <name>dfs.ha.automatic-failover.enabled</name>
       <value>true</value>
     </property>    
    <!-- 不开启权限检查 -->
    <property>
      <name>dfs.permissions.enabled</name>
      <value>false</value>
    </property>

</configuration>

3.7.5 yarn-site.xml

ref

<configuration>
	<!-- 指定MR走shuffle -->
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <!--启用resourcemanager ha-->
    <property>
        <name>yarn.resourcemanager.ha.enabled</name>
        <value>true</value>
    </property>
    <!--声明两台resourcemanager的地址-->
    <property>
        <name>yarn.resourcemanager.cluster-id</name>
        <value>cluster1</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.rm-ids</name>
        <value>rm1,rm2</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname.rm1</name>
        <value>hadoop161</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname.rm2</name>
        <value>hadoop163</value>
    </property>   
    <property>
      <name>yarn.resourcemanager.webapp.address.rm1</name>
      <value>hadoop161:8088</value>
    </property>
    <property>
      <name>yarn.resourcemanager.webapp.address.rm2</name>
      <value>hadoop163:8088</value>
    </property>
    <!--指定zookeeper集群的地址--> 
    <property>
        <name>yarn.resourcemanager.zk-address</name>
        <value>hadoop161:2181,hadoop162:2181,hadoop163:2181</value>
    </property>
    <!--启用自动恢复--> 
    <property>
        <name>yarn.resourcemanager.recovery.enabled</name>
        <value>true</value>
    </property>
    <!--指定resourcemanager的状态信息存储在zookeeper集群--> 
    <property>
        <name>yarn.resourcemanager.store.class</name>     <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>

</configuration>

3.7.5 mapred-site.xml

ref

<configuration>
    
	<!-- 指定MapReduce程序运行在Yarn上 -->
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
 
</configuration>

3.7.6 workers

hadoop161
hadoop162
hadoop163

3.7.7 分发同步

xsync /opt/module/hadoop-ha
#xsync /opt/module/env.sh

3.7.8 手动启动

#Before starting, make sure HADOOP_HOME now points to the HA installation (it must override the earlier setting)
#cd /opt/module/hadoop-ha/hadoop-3.1.3
#1.启动journalnode hadoop161 hadoop162 hadoop163
#sbin/hadoop-daemon.sh start journalnode
bin/hdfs --daemon start journalnode
bin/hdfs --daemon start journalnode
bin/hdfs --daemon start journalnode

#2.nn1上对namenode进行格式化 hadoop161
bin/hdfs namenode -format

#3.启动nn1的namenode,在 nn2和nn3上进行同步
#hadoop161
#sbin/hadoop-daemon.sh start namenode
bin/hdfs --daemon start namenode

#hadoop162 hadoop163
bin/hdfs namenode -bootstrapStandby
bin/hdfs namenode -bootstrapStandby
#hadoop162 hadoop163
#sbin/hadoop-daemon.sh start namenode
bin/hdfs --daemon start namenode
bin/hdfs --daemon start namenode


#4.关闭所有hdfs服务 hadoop161
sbin/stop-all.sh 

#5.初始化HA在Zookeeper中状态 hadoop161
bin/hdfs zkfc -formatZK

#6.启动集群服务 hadoop161
sbin/start-all.sh 
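A quick state check once the HA cluster is up (a sketch using the standard admin commands; nn1/nn2/nn3 and rm1/rm2 match the IDs configured above):

bin/hdfs haadmin -getServiceState nn1
bin/hdfs haadmin -getServiceState nn2
bin/hdfs haadmin -getServiceState nn3
# expect one active and two standby
bin/yarn rmadmin -getServiceState rm1
bin/yarn rmadmin -getServiceState rm2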

4.KAFKA

4.1 下载解压

wget https://archive.apache.org/dist/kafka/2.4.1/kafka_2.11-2.4.1.tgz
#md5sum kafka_2.11-2.4.1.tgz 
#3f543b8f6e7013819b41bd02b8aad49e  kafka_2.11-2.4.1.tgz
tar zxvf kafka_2.11-2.4.1.tgz -C /opt/module/
#/opt/module/kafka_2.11-2.4.1

4.2 环境变量

/opt/module/env.sh

#KAFKA_HOME
export KAFKA_HOME=/opt/module/kafka_2.11-2.4.1
export PATH=$PATH:$KAFKA_HOME/bin

4.3 配置文件

# vim server.properties
输入以下内容:
#broker的全局唯一编号,不能重复
broker.id=161
#删除topic功能使能
delete.topic.enable=true
#处理网络请求的线程数量
num.network.threads=3
#用来处理磁盘IO的现成数量
num.io.threads=8
#发送套接字的缓冲区大小
socket.send.buffer.bytes=102400
#接收套接字的缓冲区大小
socket.receive.buffer.bytes=102400
#请求套接字的缓冲区大小
socket.request.max.bytes=104857600
#kafka运行日志存放的路径
log.dirs=/opt/module/kafka_2.11-2.4.1/logs
#topic在当前broker上的分区个数
num.partitions=1
#用来恢复和清理data下数据的线程数量
num.recovery.threads.per.data.dir=1
#segment文件保留的最长时间,超时将被删除
#默认数据保留7天注释
#log.retention.hours=168
#配置连接Zookeeper集群地址
zookeeper.connect=hadoop161:2181,hadoop162:2181,hadoop163:2181/kafka

4.4 分发同步

xsync /opt/module/kafka_2.11-2.4.1
xsync /opt/module/env.sh

4.5 修改配置

#修改另外两台的机器broker.id为162和163
/opt/module/kafka_2.11-2.4.1/config/server.properties
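A hedged one-liner per host instead of editing by hand (the files are assumed to have been distributed by xsync already):

ssh hadoop162 "sed -i 's/^broker.id=.*/broker.id=162/' /opt/module/kafka_2.11-2.4.1/config/server.properties"
ssh hadoop163 "sed -i 's/^broker.id=.*/broker.id=163/' /opt/module/kafka_2.11-2.4.1/config/server.properties"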

4.6 群起脚本

4.6.1 kfk.sh

#!/bin/bash

if (($#==0))
then
    echo -e "\033[31m start, stop or status \033[0m"
    exit 1;
fi
case $1 in
"start")
	for i in hadoop161 hadoop162 hadoop163
	do
    	echo  -e "\033[32m =====================  $1ing kafka in $i  =======================\033[0m"
    	ssh $i "/opt/module/kafka_2.11-2.4.1/bin/kafka-server-start.sh -daemon /opt/module/kafka_2.11-2.4.1/config/server.properties" 2> /dev/null
	done
	;;
"stop")
	for i in hadoop161 hadoop162 hadoop163
	do
    	echo  -e "\033[31m =====================  $1ing kafka in $i  =======================\033[0m"
    	ssh $i "/opt/module/kafka_2.11-2.4.1/bin/kafka-server-stop.sh " 2> /dev/null
	done
	;;
"status")
	/bin/bash /home/forsre/bin/jpsall
	;;
*)
    	echo -e "\033[31m start, stop or status \033[0m"
	;; 
esac

4.6.2 systemd

#vim /etc/systemd/system/kafka.service
[Unit]
Description=Apache Kafka server (broker)
Documentation=http://kafka.apache.org/documentation.html
Requires=network.target
After=network.target zookeeper.service

[Service]
Type=forking
User=forsre
Group=forsre
Environment=JAVA_HOME=/opt/module/jdk1.8.0_202
ExecStart=/opt/module/kafka_2.11-2.4.1/bin/kafka-server-start.sh /opt/module/kafka_2.11-2.4.1/config/server.properties
ExecStop=/opt/module/kafka_2.11-2.4.1/bin/kafka-server-stop.sh

[Install]
WantedBy=multi-user.target

4.7 常用命令

Kafka命令行操作
1)查看当前服务器中的所有topic
 bin/kafka-topics.sh --zookeeper hadoop161:2181/kafka --list
2)创建topic
bin/kafka-topics.sh --zookeeper hadoop161:2181/kafka \
--create --replication-factor 3 --partitions 1 --topic first
#选项说明:
--topic 定义topic名
--replication-factor  定义副本数
--partitions  定义分区数
3)删除topic
bin/kafka-topics.sh --zookeeper hadoop161:2181/kafka \
--delete --topic first
#需要server.properties中设置delete.topic.enable=true否则只是标记删除。
4)发送消息
bin/kafka-console-producer.sh \
--broker-list hadoop161:9092 --topic first
5)消费消息
bin/kafka-console-consumer.sh \
--bootstrap-server hadoop161:9092 --from-beginning --topic first
#--from-beginning:会把主题中以往所有的数据都读取出来。
6)查看某个Topic的详情
bin/kafka-topics.sh --zookeeper hadoop161:2181/kafka \
--describe --topic first
7)修改分区数
bin/kafka-topics.sh --zookeeper hadoop161:2181/kafka --alter --topic first --partitions 6

4.8 监控 Kafka-Eagle

4.8.1 下载解压

#http://www.kafka-eagle.org/articles/docs/changelog/changelog.html
wget https://github.com/smartloli/kafka-eagle-bin/archive/v1.4.8.tar.gz
md5sum v1.4.8.tar.gz 
#261c661fb23a1f4792069a19ff1b2ffb  v1.4.8.tar.gz
tar zxvf v1.4.8.tar.gz 
tar zxvf kafka-eagle-web-1.4.8-bin.tar.gz -C /opt/module/

#/opt/module/env.sh

#KE_HOME
export KE_HOME=/opt/module/kafka-eagle-web-1.4.8
export PATH=$PATH:$KE_HOME/bin

#xsync /opt/module/env.sh

4.8.2 修改kafka jmx启动

#In kafka-server-start.sh, replace the following block
if [ "x$KAFKA_HEAP_OPTS" = "x" ]; then
    export KAFKA_HEAP_OPTS="-Xmx1G -Xms1G"
fi
#
if [ "x$KAFKA_HEAP_OPTS" = "x" ]; then
    export KAFKA_HEAP_OPTS="-server -Xms2G -Xmx2G -XX:PermSize=128m -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -XX:ParallelGCThreads=8 -XX:ConcGCThreads=5 -XX:InitiatingHeapOccupancyPercent=70"
    export JMX_PORT="9999"
    #export KAFKA_HEAP_OPTS="-Xmx1G -Xms1G"
fi
#
xsync kafka-server-start.sh

4.8.3 配置文件

#vim system-config.properties
cluster1.zk.list=hadoop161:2181,hadoop162:2181,hadoop163:2181
cluster1.kafka.eagle.offset.storage=kafka
kafka.eagle.metrics.charts=true
kafka.eagle.url=jdbc:sqlite:/opt/module/kafka-eagle-web-1.4.8/db/ke.db
kafka.eagle.username=forsre
kafka.eagle.password=123456

4.8.4 启动

chmod +x /opt/module/kafka-eagle-web-1.4.8/bin/ke.sh
ke.sh start
http://192.168.8.161:8048/ke

5. Flume

5.1 下载解压

#http://flume.apache.org/download.html
#https://downloads.apache.org/
wget https://downloads.apache.org/flume/1.9.0/apache-flume-1.9.0-bin.tar.gz
#md5sum apache-flume-1.9.0-bin.tar.gz 
#cc17aaa30cbd051eaa2cdbecd73c3bab  apache-flume-1.9.0-bin.tar.gz
tar -zxvf apache-flume-1.9.0-bin.tar.gz -C /opt/module/
mv /opt/module/apache-flume-1.9.0-bin /opt/module/flume-1.9.0
#将lib文件夹下的guava-11.0.2.jar删除以兼容Hadoop 3.1.3
rm /opt/module/flume-1.9.0/lib/guava-11.0.2.jar
# on hadoop 3.1.3
#ls -la /opt/module/hadoop-3.1.3/share/hadoop/common/lib | grep guava*
guava-27.0-jre.jar

5.2 环境变量

mv flume-env.sh.template flume-env.sh
#flume-env.sh
export JAVA_HOME=/opt/module/jdk1.8.0_202
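A quick check that Flume starts with this JDK (a sketch; run from /opt/module/flume-1.9.0):

bin/flume-ng version
# should report Flume 1.9.0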

6.Mysql

6.1 Mysql

#https://dev.mysql.com/downloads/mysql/5.7.html
#https://downloads.mysql.com/archives/community/
#https://forsre.cn/pages/linux%E5%AE%89%E8%A3%85mysql%E7%A4%BE%E5%8C%BA%E7%89%88/
wget https://downloads.mysql.com/archives/get/p/23/file/mysql-5.7.30-1.el7.x86_64.rpm-bundle.tar
tar xvf mysql-5.7.30-1.el7.x86_64.rpm-bundle.tar
#md5sum mysql-5.7.30-1.el7.x86_64.rpm-bundle.tar 
#56b94aef45542efdc8714423e0bd241f  mysql-5.7.30-1.el7.x86_64.rpm-bundle.tar
#install
sudo yum -y install mysql-community-{server,client,common,libs}-*
#init
sudo systemctl start mysqld.service
sudo grep 'temporary password' /var/log/mysqld.log
sudo systemctl enable mysqld.service
#password as 000000 for test
set global validate_password_policy=LOW;
set global validate_password_length=6;
alter user 'root'@'localhost' identified by '000000';
SHOW VARIABLES LIKE 'validate_password%'; 
use mysql;
update user set host='%' where user='root';
flush privileges;
select user,host from user;

6.2 Mysql-connector

# https://dev.mysql.com/downloads/connector/j/
# https://downloads.mysql.com/archives/c-j/
wget https://downloads.mysql.com/archives/get/p/3/file/mysql-connector-java-5.1.49.tar.gz
or
wget https://downloads.mysql.com/archives/get/p/3/file/mysql-connector-java-5.1.49.zip

Summary of Connector/J Versions

Connector/J 5.1: JDBC 3.0/4.0/4.1/4.2; MySQL Server 5.6, 5.7, 8.0; requires JRE 5 or higher; General Availability
Connector/J 8.0: JDBC 4.2; MySQL Server 5.6, 5.7, 8.0; requires JRE 8 or higher; General Availability (recommended)

7.Hive

7.1 下载安装

If you use Hive on MR, Hive on Tez, or Hive on Spark (the official build supports Spark 2.x by default), there is no need to rebuild Hive; use the official 3.1.2 release directly and just resolve the jar conflicts:

cd /opt/module/apache-hive-3.1.2/lib
mv log4j-slf4j-impl-2.10.0.jar log4j-slf4j-impl-2.10.0.jar.bak
mv guava-19.0.jar guava-19.0.jar.bak
cp  /opt/module/hadoop-3.1.3/share/hadoop/common/lib/guava-27.0-jre.jar ./

hive on spark

However, since this environment must use Spark 3.0.0 as the Hive engine, Hive was built from source:

文档

#git clone https://github.com/forsre/hive3.1.2.git
git clone https://gitee.com/forsre/hive3.1.2.git
cd hive3.1.2 
mvn clean package -DskipTests -Pdist
#the built package is under packaging/target

tar zxvf apache-hive-3.1.2-bin.tar.gz -C /opt/module/
mv /opt/module/apache-hive-3.1.2-bin  /opt/module/apache-hive-3.1.2

7.2 环境变量

/opt/module/env.sh

#HIVE_HOME
export HIVE_HOME=/opt/module/apache-hive-3.1.2
export PATH=$PATH:$HIVE_HOME/bin

#解决日志Jar包冲突,进入/opt/module/apache-hive-3.1.2/lib目录
cd /opt/module/apache-hive-3.1.2/lib
mv log4j-slf4j-impl-2.10.0.jar log4j-slf4j-impl-2.10.0.jar.bak

7.3 元数据配置至Mysql

cp /opt/soft/mysql-connector-java-5.1.49/mysql-connector-java-5.1.49-bin.jar /opt/module/apache-hive-3.1.2/lib/

7.4 配置文件 hive-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>


<configuration>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://hadoop161:3306/metastore?useSSL=false</value>
    </property>

    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
    </property>

    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>root</value>
    </property>

    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>000000</value>
    </property>

    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/user/hive/warehouse</value>
    </property>

    <property>
        <name>hive.metastore.uris</name>
        <value>thrift://hadoop161:9083</value>
    </property>

    <property>
        <name>hive.metastore.schema.verification</name>
        <value>false</value>
    </property>

    <property>
    <name>hive.server2.thrift.port</name>
    <value>10000</value>
    </property>

    <property>
        <name>hive.server2.thrift.bind.host</name>
        <value>hadoop161</value>
    </property>

    <property>
        <name>hive.metastore.event.db.notification.api.auth</name>
        <value>false</value>
    </property>
    
    <property>
        <name>hive.cli.print.header</name>
        <value>true</value>
    </property>

    <property>
        <name>hive.cli.print.current.db</name>
        <value>true</value>
    </property>
</configuration>

7.5 修改日志文件

mv hive-log4j2.properties.template hive-log4j2.properties

#hive-log4j2.properties
hive.log.dir=/opt/module/apache-hive-3.1.2/logs

7.6 初始化启动

#mysql
mysql -uroot -p000000
create database metastore;
quit;
#初始化Hive元数据库
schematool -initSchema -dbType mysql -verbose

#hive

#http://hadoop161:10002 webui
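With the metastore and HiveServer2 running (see hs.sh below), a hedged connectivity check through Beeline:

beeline -u jdbc:hive2://hadoop161:10000 -n forsre
# 0: jdbc:hive2://hadoop161:10000> show databases;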

7.7 启动脚本 hs.sh

#!/bin/bash
HIVE_LOG_DIR=$HIVE_HOME/logs
if [ ! -d $HIVE_LOG_DIR ]
then
	mkdir -p $HIVE_LOG_DIR
fi
#检查进程是否运行正常,参数1为进程名,参数2为进程端口
function check_process()
{
    pid=$(ps -ef 2>/dev/null | grep -v grep | grep -i $1 | awk '{print $2}')
    ppid=$(netstat -nltp 2>/dev/null | grep $2 | awk '{print $7}' | cut -d '/' -f 1)
    echo $pid
    [[ "$pid" =~ "$ppid" ]] && [ "$ppid" ] && return 0 || return 1
}

function hive_start()
{
    metapid=$(check_process HiveMetastore 9083)
    cmd="nohup hive --service metastore >$HIVE_LOG_DIR/metastore.log 2>&1 &"
    [ -z "$metapid" ] && eval $cmd || echo "Metastroe服务已启动"
    server2pid=$(check_process HiveServer2 10000)
    cmd="nohup hiveserver2 >$HIVE_LOG_DIR/hiveServer2.log 2>&1 &"
    [ -z "$server2pid" ] && eval $cmd || echo "HiveServer2服务已启动"
}

function hive_stop()
{
metapid=$(check_process HiveMetastore 9083)
    [ "$metapid" ] && kill $metapid || echo "Metastore服务未启动"
    server2pid=$(check_process HiveServer2 10000)
    [ "$server2pid" ] && kill $server2pid || echo "HiveServer2服务未启动"
}

case $1 in
"start")
    hive_start
    ;;
"stop")
    hive_stop
    ;;
"restart")
    hive_stop
    sleep 2
    hive_start
    ;;
"status")
    check_process HiveMetastore 9083 >/dev/null && echo "Metastore服务运行正常" || echo "Metastore服务运行异常"
    check_process HiveServer2 10000 >/dev/null && echo "HiveServer2服务运行正常" || echo "HiveServer2服务运行异常"
    ;;
*)
    echo Invalid Args!
    echo 'Usage: '$(basename $0)' start|stop|restart|status'
    ;;
esac

7.8 hive on spark(和tez配一个就行)

doc

7.8.1 下载解压

wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-without-hadoop.tgz
#md5sum spark-3.0.0-bin-*
#c5c0af4b4b9cb21d214c7026439df236  spark-3.0.0-bin-hadoop3.2.tgz
#039f2a8ec971705063ac0bfb5c2a7959  spark-3.0.0-bin-without-hadoop.tgz
tar zxvf spark-3.0.0-bin-hadoop3.2.tgz -C /opt/module
mv /opt/module/spark-3.0.0-bin-hadoop3.2 /opt/module/spark-3.0.0
tar zxvf spark-3.0.0-bin-without-hadoop.tgz -C /opt/module
hadoop fs -mkdir /spark3.0.0-jars
hadoop fs -put /opt/module/spark-3.0.0-bin-without-hadoop/jars/* /spark3.0.0-jars

7.8.2 配置文件

7.8.2.1 hive-site.xml

The default value of hive.spark.client.connect.timeout is 1000 ms; if Hive INSERT statements throw connection exceptions, increase it to 10000 ms.

<!--Spark依赖位置(注意:端口号8020必须和namenode的端口号一致)-->
<property>
    <name>spark.yarn.jars</name>
    <value>hdfs://hadoop161:8020/spark3.0.0-jars/*</value>
</property>
  
<!--Hive执行引擎-->
<property>
    <name>hive.execution.engine</name>
    <value>spark</value>
</property>

<!--Hive和Spark连接超时时间-->
<property>
    <name>hive.spark.client.connect.timeout</name>
    <value>10000ms</value>
</property>
7.8.2.2 spark-defaults.conf
#先创建spark-history
hadoop fs -mkdir /spark-history

#  dynamicAllocation.enabled  需要yarn配置
#1.Build Spark with the YARN profile. Skip this step if you are using a pre-packaged distribution.
#2.Locate the spark-<version>-yarn-shuffle.jar. This should be under $SPARK_HOME/network/yarn/target/scala-<version> if you are building Spark yourself, and under lib if you are using a distribution.Add this jar to the classpath of all NodeManagers in your cluster.
#3.In the yarn-site.xml on each node, add spark_shuffle to yarn.nodemanager.aux-services, then set yarn.nodemanager.aux-services.spark_shuffle.class to org.apache.spark.network.yarn.YarnShuffleService.
#4.Restart all NodeManagers in your cluster.
#https://community.cloudera.com/t5/Support-Questions/Spark-dynamic-allocation-dont-work/td-p/140227

#或者将配置文件写入hive-site.xml
#Configure Spark-application configs for Hive.  See: http://spark.apache.org/docs/latest/configuration.html.  This can be done either by adding a file "spark-defaults.conf" with these properties to the Hive classpath, or by setting them on Hive configuration (hive-site.xml). For instance:


# /opt/module/apache-hive-3.1.2/conf/spark-defaults.conf
spark.home							  /opt/module/spark-3.0.0
spark.master                             yarn
spark.eventLog.enabled                   true
spark.eventLog.dir                       hdfs://hadoop161:8020/spark-history
spark.executor.memory                    2g
spark.driver.memory					   2g
#spark.dynamicAllocation.enabled 		true   #需要yarn开启
spark.serializer					   org.apache.spark.serializer.KryoSerializer

or hive-site.xml

<property>
    <name>spark.home</name>
    <value>/opt/module/spark-3.0.0</value>
</property>
<property>
    <name>spark.master</name>
    <value>yarn</value>
</property>
<property>
    <name>spark.eventLog.enabled</name>
    <value>true</value>
</property>
<property>
    <name>spark.eventLog.dir</name>
    <value>hdfs://hadoop161:8020/spark-history</value>
</property>
<property>
    <name>spark.executor.memory</name>
    <value>2g</value>
</property>
<property>
    <name>spark.driver.memory</name>
    <value>2g</value>
</property>
<!--
<property>
    <name>spark.dynamicAllocation.enabled</name>
    <value>true</value>
</property>
-->
<property>
    <name>spark.serializer</name>
    <value>org.apache.spark.serializer.KryoSerializer</value>
</property>

7.8.3 capacity-scheduler.xml 调优

#Increase the ApplicationMaster resource fraction
#The capacity scheduler caps the resources that running ApplicationMasters may occupy per queue via yarn.scheduler.capacity.maximum-am-resource-percent. The default 0.1 means AMs can use at most 10% of a queue's resources, which prevents AMs from crowding out Map/Reduce tasks.
#The default is fine in production, but on a small learning cluster 10% may be enough for only one AM, so only one job can run at a time; the value is therefore raised here.
#Edit the following parameter in /opt/module/hadoop-3.1.3/etc/hadoop/capacity-scheduler.xml


<property>
    <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
    <value>0.8</value>
</property>

#capacity-scheduler.xml配置文件
xsync capacity-scheduler.xml
#重新启动yarn或者执行yarn rmadmin -refreshQueues命令
yarn rmadmin -refreshQueues
sbin/stop-yarn.sh
sbin/start-yarn.sh

7.8.4 测试

create table student(id int, name string);
insert into table student values(1,'abc');

7.9 hive on tez

7.9.1 下载安装

# https://tez.apache.org/install.html
# https://github.com/apache/tez
# https://github.com/apache/tez/blob/rel/release-0.9.2/pom.xml
# https://github.com/apache/tez/blob/rel/release-0.10.1/pom.xml
# The pom files on git show that Tez supports Hadoop 3.1.3 from 0.10.0 onwards; earlier releases only go up to Hadoop 2.7.2. This does not matter if you build from source, since the dependency can be changed
#wget https://downloads.apache.org/tez/0.10.1/apache-tez-0.10.1-src.tar.gz
#wget https://downloads.apache.org/tez/0.10.0/apache-tez-0.10.0-src.tar.gz
#安装依赖
#缺少依赖会报错
#org.apache.maven.plugin.MojoExecutionException: 'protoc --version' did not return a version 
#sudo yum -y install protobuf protobuf-devel
#或者编译安装依赖
# wget https://github.com/google/protobuf/releases/download/v2.5.0/protobuf-2.5.0.tar.gz
# tar -xzvf protobuf-2.5.0.tar.gz
# cd protobuf-2.5.0 && ./configure --prefix=/usr && make && sudo make install
# mvn clean package -DskipTests=true -Dmaven.javadoc.skip=true
# jar包位于 tez-dist/target,其中minimal包组放置本机,全量包上传至HDFS,编译完成后将两个jar包上传到机器上
#tez-0.10.1-minimal.tar.gz
#tez-0.10.1.tar.gz
#解压
#mkdir /opt/module/apache-tez-0.10.1
#tar zxvf tez-0.10.1.tar.gz -C /opt/module/apache-tez-0.10.1/
1.下载解压
wget https://downloads.apache.org/tez/0.10.1/apache-tez-0.10.1-bin.tar.gz
#md5sum  apache-tez-0.10.1-bin.tar.gz 
#6a654ee5fbd45a13ab2e0df44965bb1d  apache-tez-0.10.1-bin.tar.gz
tar zxvf apache-tez-0.10.1-bin.tar.gz -C /opt/module/
mv /opt/module/apache-tez-0.10.1-bin/ /opt/module/apache-tez-0.10.1/
# 解压完成后,目录内的share里面的tez.tar.gz是上传到HDFS里面的全量包,外面的jar包和lib里面的jar包是放置到本机的依赖,和自己编译的那个mininal类似。
#解决jar包冲突
mv /opt/module/apache-tez-0.10.1/lib/slf4j-log4j12-1.7.30.jar{,.bak}
#上传hdfs
hadoop fs -mkdir /tez-0.10.1
hadoop fs -put /opt/module/apache-tez-0.10.1/share/tez.tar.gz /tez-0.10.1

7.9.2 配置文件

Tez can be set up in two ways: as a global Hadoop engine (enable it in mapred-site.xml), or for Hive only, i.e. Hive on Tez. The Hive on Tez mode is used here:

7.9.2.1 Hive On Tez

[ref]https://tez.apache.org/releases/0.10.1/tez-api-javadocs/configs/TezConfiguration.html

7.9.2.1.1 拷贝jar包
cp -av /opt/module/apache-tez-0.10.1/*.jar /opt/module/apache-hive-3.1.2/lib/
cp -av /opt/module/apache-tez-0.10.1/lib/*.jar /opt/module/apache-hive-3.1.2/lib/
7.9.2.1.2 hive-site.xml
<property>
    <name>hive.execution.engine</name>
    <value>tez</value>
</property>
<property>
    <name>hive.tez.container.size</name>
    <value>2048</value>
</property>

<property>
	<name>tez.lib.uris</name>
    <value>hdfs://hadoop161:8020/tez-0.10.1/tez.tar.gz</value>
</property>
<property>
     <name>tez.use.cluster.hadoop-libs</name>
     <value>true</value>
</property>
<property>
     <name>tez.am.resource.memory.mb</name>
     <value>1024</value>
</property>
<property>
     <name>tez.am.resource.cpu.vcores</name>
     <value>2</value>
</property>
<property>
     <name>tez.container.max.java.heap.fraction</name>
     <value>0.8</value>
</property>
<property>
     <name>tez.task.resource.memory.mb</name>
     <value>1024</value>
</property>
<property>
     <name>tez.task.resource.cpu.vcores</name>
     <value>2</value>
</property>
<property>
     <name>tez.tez-ui.history-url.base</name>
     <value>hadoop161:8999</value>
</property>
7.9.2.2 Hadoop on Tez
7.9.2.2.1 tez-site.xml

Located at /opt/module/hadoop-3.1.3/etc/hadoop/tez-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
	<name>tez.lib.uris</name>
    <value>hdfs://hadoop161:8020/tez-0.10.1/tez.tar.gz</value>
</property>
<property>
     <name>tez.use.cluster.hadoop-libs</name>
     <value>true</value>
</property>
<property>
     <name>tez.am.resource.memory.mb</name>
     <value>2048</value>
</property>
<property>
     <name>tez.am.resource.cpu.vcores</name>
     <value>2</value>
</property>
<property>
     <name>tez.container.max.java.heap.fraction</name>
     <value>0.4</value>
</property>
<property>
     <name>tez.task.resource.memory.mb</name>
     <value>2048</value>
</property>
<property>
     <name>tez.task.resource.cpu.vcores</name>
     <value>2</value>
</property>
</configuration>
7.9.2.2.2 hive-site.xml
<property>
    <name>hive.execution.engine</name>
    <value>tez</value>
</property>
<property>
    <name>hive.tez.container.size</name>
    <value>2048</value>
</property>
7.9.2.2.3 环境变量

/opt/module/hadoop-3.1.3/etc/hadoop/shellprofile.d/tez.sh

# /opt/module/hadoop-3.1.3/etc/hadoop/shellprofile.d/tez.sh

hadoop_add_profile tez
function _tez_hadoop_classpath
{
    hadoop_add_classpath "$HADOOP_HOME/etc/hadoop" after
    hadoop_add_classpath "/opt/module/apache-tez-0.10.1/*" after
    hadoop_add_classpath "/opt/module/apache-tez-0.10.1/lib/*" after
}
7.9.2.2.4 重启Hadoop集群

Hive is deployed on only one machine here, so the Hadoop change does not need to be distributed.

7.9.3 测试

create table student(id int, name string);
insert into table student values(1,'abc');

7.10 常用命令

#1.退出hive窗口:
hive(default)>exit;
hive(default)>quit;
In recent Hive versions there is no difference; in older versions there was:
exit: implicitly commits data, then exits;
quit: exits without committing;
#2.查看hdfs文件系统
dfs -ls /;
#3.本地文件系统
! ls /opt/module/datas;
#4.历史命令
.hivehistory文件
#5.“-e”不进入hive的交互窗口执行sql语句
bin/hive -e "select id from student;"
#6.“-f”执行脚本中sql语句
#(1)在/opt/module/datas目录下创建hivef.sql文件
vim hivef.sql
select *from student;
#(2)执行文件中的sql语句
bin/hive -f /opt/module/datas/hivef.sql
#(3)执行文件中的sql语句并将结果写入文件中
bin/hive -f /opt/module/datas/hivef.sql  > /opt/module/datas/hive_result.txt

8.Sqoop

8.1下载解压

# http://archive.apache.org/dist/sqoop/1.4.7/
wget http://archive.apache.org/dist/sqoop/1.4.7/sqoop-1.4.7.bin__hadoop-2.6.0.tar.gz
md5sum sqoop-1.4.7.bin__hadoop-2.6.0.tar.gz 
#390122c6714fb5a09ec45aff4cbb0052  sqoop-1.4.7.bin__hadoop-2.6.0.tar.gz
tar zxvf sqoop-1.4.7.bin__hadoop-2.6.0.tar.gz -C /opt/module/
mv /opt/module/sqoop-1.4.7.bin__hadoop-2.6.0/ /opt/module/sqoop-1.4.7

8.2 修改配置

cd /opt/module/sqoop-1.4.7/conf
mv sqoop-env-template.sh sqoop-env.sh

# vim sqoop-env.sh
export HADOOP_COMMON_HOME=/opt/module/hadoop-3.1.3
export HADOOP_MAPRED_HOME=/opt/module/hadoop-3.1.3
export HIVE_HOME=/opt/module/apache-hive-3.1.2
export ZOOKEEPER_HOME=/opt/module/zookeeper-3.5.7
export ZOOCFGDIR=/opt/module/zookeeper-3.5.7/conf



/opt/module/env.sh

#SQOOP_HOME
export SQOOP_HOME=/opt/module/sqoop-1.4.7
export PATH=$PATH:$SQOOP_HOME/bin

#jdbc
cp /opt/soft/mysql-connector-java-5.1.49/mysql-connector-java-5.1.49-bin.jar /opt/module/sqoop-1.4.7/lib/

8.3 测试启动

# env test
sqoop help

#db test
sqoop list-databases --connect jdbc:mysql://hadoop161:3306/ --username root --password 000000
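A hedged import sketch (the table DBS from the metastore database and the target directory are purely illustrative):

sqoop import \
--connect jdbc:mysql://hadoop161:3306/metastore \
--username root --password 000000 \
--table DBS \
--target-dir /sqoop_test/dbs \
--delete-target-dir \
--num-mappers 1 \
--fields-terminated-by '\t'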

9.Hbase

https://cdn.forsre.cn/hbase_hadoop_version.png

9.1 下载安装

# https://github.com/apache/hbase/blob/rel/2.0.5/pom.xml
# https://archive.apache.org/dist/hbase/2.0.5/
The pom file in the source shows that no release is compatible with Hadoop 3.1.3, so the source package must be built:
wget https://archive.apache.org/dist/hbase/2.0.5/hbase-2.0.5-src.tar.gz
#编译
# https://hbase.apache.org/book.html#quickstart
# http://www.kailing.pub/article/index/arcid/267.html
mvn clean package -DskipTests assembly:single -Dhadoop.profile=3.0 -Dhadoop-three.version=3.1.3 -Denforcer.fail=false
#Option 1: adjust the local environment to satisfy the plugin constraints, e.g. upgrade the local Maven to 3.5.0 or later
#Option 2: skip the enforcer checks, e.g. mvn install -Denforcer.skip=true
#Option 3: let rule violations not fail the build, e.g. mvn install -Denforcer.fail=false
#
The built tarball is under hbase-assembly/target
Upload hbase-2.0.5-bin.tar.gz to the server
#解压
tar -zxvf hbase-2.0.5-bin.tar.gz -C /opt/module
#jar包冲突
mv /opt/module/hbase-2.0.5/lib/slf4j-log4j12-1.7.25.jar{,.bak}

9.2 环境变量

/opt/module/env.sh

#HBASE_HOME
export HBASE_HOME=/opt/module/hbase-2.0.5
export PATH=$PATH:$HBASE_HOME/bin

9.3 配置文件

9.3.1 hbase-env.sh

export HBASE_MANAGES_ZK=false

9.3.2 hbase-site.xml

<configuration>
  <property>
    <name>hbase.rootdir</name>
    <value>hdfs://hadoop161:8020/hbase-2.0.5</value>
  </property>
  <property>
    <name>hbase.cluster.distributed</name>
    <value>true</value>
  </property>
  <property>
    <name>hbase.zookeeper.quorum</name>
    <value>hadoop161,hadoop162,hadoop163</value>
  </property>
</configuration>

9.3.3 regionservers

hadoop161
hadoop162
hadoop163

9.3.4 软连接hadoop配置文件到HBase

ln -s /opt/module/hadoop-3.1.3/etc/hadoop/core-site.xml /opt/module/hbase-2.0.5/conf/core-site.xml
ln -s /opt/module/hadoop-3.1.3/etc/hadoop/hdfs-site.xml /opt/module/hbase-2.0.5/conf/hdfs-site.xml

9.4 同步分发

xsync /opt/module/env.sh
xsync /opt/module/hbase-2.0.5

9.5 启动

#单点启动
bin/hbase-daemon.sh start master
bin/hbase-daemon.sh start regionserver

#群起
bin/start-hbase.sh
#群关
bin/stop-hbase.sh

#访问页面
http://hadoop161:16010 

9.6 错误排查

If the clocks of the cluster nodes are not synchronized, the regionservers fail to start and throw a ClockOutOfSyncException.

Fix 1: synchronize the cluster clocks; chronyd is used here.

Fix 2: set a larger value for hbase.master.maxclockskew:

<property>
        <name>hbase.master.maxclockskew</name>
        <value>180000</value>
        <description>Time difference of regionserver from master</description>
</property>

9.7 HA配置(选配)

9.7.1 停止原集群

9.7.2 修改配置并同步

touch conf/backup-masters
echo hadoop163 > conf/backup-masters
xsync conf

10. Phoenix

10.1 下载解压

11. Presto

12. Kylin

13. Superset

14. Kerberos

15.Ranger

16.Atlas

17.ES

17.1 下载安装

#首先修改系统配置
#/etc/sysctl.conf 
vm.max_map_count=262144
sysctl -p
#
#/etc/security/limits.conf
* soft nofile 65536
* hard nofile 131072
* soft nproc 2048
* hard nproc 65536

# https://www.elastic.co/cn/downloads/past-releases/#elasticsearch
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.8.8.tar.gz
wget https://artifacts.elastic.co/downloads/kibana/kibana-6.8.8-linux-x86_64.tar.gz
wget https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v6.8.8/elasticsearch-analysis-ik-6.8.8.zip
tar zxvf elasticsearch-6.8.8.tar.gz -C /opt/module
tar zxvf kibana-6.8.8-linux-x86_64.tar.gz -C /opt/module
mv kibana-6.8.8-linux-x86_64/ kibana-6.8.8

17.2 配置文件

#-Xms: 指定虚拟机堆内存初始值大小
#-Xmx: 指定虚拟机堆内存最大值大小
#把两者设置为一致,是为了避免频繁扩容和GC释放堆内存造成的系统开销/压力
#Oracle recommends setting the minimum heap size (-Xms)equal to the maximum heap size (-Xmx) to minimize garbage collections.
# https://docs.oracle.com/cd/E15523_01/web.1111/e13814/jvm_tuning.htm#PERFM161

mkdir /opt/module/elasticsearch-6.8.8/run
#vim config/jvm.options

-Djava.net.preferIPv4Stack=true
-Xms512m
-Xmx512m

vim config/elasticsearch.yml 

cluster.name: es.forsre.cn
node.name: hadoop161
bootstrap.memory_lock: false
network.host: hadoop161
http.port: 9200
discovery.zen.ping.unicast.hosts: ["hadoop161", "hadoop162", "hadoop163"]
discovery.zen.minimum_master_nodes: 2
cluster.routing.allocation.disk.threshold_enabled: false

17.3 分发同步

 xsync /opt/module/elasticsearch-6.8.8/
 
 #After distribution, change node.name and network.host on the other two machines

17.4 配置kibana

# mkdir /opt/module/kibana-6.8.8/logs

#vim config/kibana.yml 

server.host: "hadoop161"
elasticsearch.hosts: ["http://hadoop161:9200", "http://hadoop162:9200", "http://hadoop163:9200"]

17.5 启动脚本

17.5.1 es.sh

#!/bin/bash 

es_home=/opt/module/elasticsearch-6.8.8
kibana_home=/opt/module/kibana-6.8.8
case $1  in
 "start") {
  for i in hadoop161 hadoop162 hadoop163
  do
    echo "==============$i starting=============="
    ssh $i  "source /opt/module/env.sh;${es_home}/bin/elasticsearch >/dev/null 2>&1 &"
done
${kibana_home}/bin/kibana -l ${kibana_home}/logs/kibana.log 2>&1 &
 };;


"stop") {
  ps -ef|grep ${kibana_home} |grep -v grep|awk '{print $2}'|xargs kill
  for i in hadoop161 hadoop162 hadoop163
  do
      echo "==============$i stopping=============="
      ssh $i "ps -ef|grep $es_home |grep -v grep|awk '{print \$2}'|xargs kill" >/dev/null 2>&1
  done
  };;


"status") {
  for i in hadoop161 hadoop162 hadoop163
  do
      ssh $i "ps -ef | grep 6.8.8 | grep -v grep"
  done
};;
esac

17.5.2 systemd

elasticsearch.service

mkdir /opt/module/elasticsearch-6.8.8/run
#vim /etc/systemd/system/elasticsearch.service 
[Unit]
Description=Elasticsearch
After=network.target

[Service]
Type=simple
PrivateTmp=false
Environment=PID_DIR=/opt/module/elasticsearch-6.8.8/run
Environment=JAVA_HOME=/opt/module/jdk1.8.0_202
ExecStart=/opt/module/elasticsearch-6.8.8/bin/elasticsearch -p ${PID_DIR}/elasticsearch.pid --quiet
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure
RestartSec=42s
LimitNOFILE=65535
LimitNPROC=4096
User=forsre

[Install]
WantedBy=multi-user.target

kibana.service

mkdir /opt/module/kibana-6.8.8/logs
# vim /etc/systemd/system/kibana.service 
[Unit]
Description=Kibana
After=network.target elasticsearch.service

[Service]
Type=simple
ExecStart=/opt/module/kibana-6.8.8/bin/kibana -l /opt/module/kibana-6.8.8/logs/kibana.log
KillMode=process
Restart=on-failure
User=forsre
RestartSec=42s

[Install]
WantedBy=multi-user.target  

17.6 中文分词插件

Analyzer | Strengths | Weaknesses
Smart Chinese Analysis | official plugin | very poor Chinese segmentation quality
IKAnalyzer | easy to use, supports custom and remote dictionaries | dictionaries must be maintained manually, no part-of-speech tagging
Jieba | recognizes new words | no part-of-speech tagging
Ansj | good segmentation accuracy, part-of-speech tagging | smaller dictionary than HanLP, steeper learning curve
HanLP | most complete dictionary, very rich feature set | steeper learning curve
# https://github.com/medcl/elasticsearch-analysis-ik
wget https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v6.8.8/elasticsearch-analysis-ik-6.8.8.zip
unzip elasticsearch-analysis-ik-6.8.8.zip -d /opt/module/elasticsearch-6.8.8/plugins/ik
#查看ik/conf下的文件,分词就是将所有词汇分好放到文件中
#xsync
xsync  /opt/module/elasticsearch-6.8.8/plugins/ik
#es restart
es.sh stop
es.sh start
#test
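# a hedged check of the IK analyzer via the _analyze API (no index required):
curl -s -H 'Content-Type: application/json' -X POST 'http://hadoop161:9200/_analyze?pretty' \
  -d '{"analyzer":"ik_smart","text":"中华人民共和国"}'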
         

17.7 启动测试

# es
http://hadoop161:9200
# kibana
http://hadoop161:5601
#node info
http://hadoop161:9200/_cat/nodes?v

18.ClickHouse

18.1下载安装


# https://repo.yandex.ru/clickhouse/rpm/stable/x86_64/
# https://repo.yandex.ru/clickhouse/rpm/lts/x86_64/
# https://repo.yandex.ru/clickhouse/deb/lts/main/
wget https://repo.yandex.ru/clickhouse/rpm/lts/x86_64/clickhouse-client-20.8.18.32-2.noarch.rpm
wget https://repo.yandex.ru/clickhouse/rpm/lts/x86_64/clickhouse-common-static-20.8.18.32-2.x86_64.rpm
wget https://repo.yandex.ru/clickhouse/rpm/lts/x86_64/clickhouse-server-20.8.18.32-2.noarch.rpm
#install
sudo yum install clickhouse* -y

18.2 修改配置文件

#conf
sudo vim /etc/clickhouse-server/config.xml
<listen_host>0.0.0.0</listen_host>

18.3 启动测试

#start
sudo systemctl start clickhouse-server
sudo systemctl enable clickhouse-server
#test
#local
clickhouse-client -m
#remote
clickhouse-client --host=hadoop161 -m

18.4 副本(高可用)

The main purpose of replicas is high availability of data: even if one ClickHouse node goes down, the same data can still be obtained from another server. ClickHouse replication depends heavily on ZooKeeper, which is used to notify replicas of state changes. Replication is per table, not per server, so replicated and non-replicated tables can coexist on the same server.
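A minimal ReplicatedMergeTree sketch run through clickhouse-client (the table name and ZooKeeper path are illustrative, and it assumes a <zookeeper> section pointing at hadoop161-163 has been added to config.xml):

clickhouse-client --host=hadoop161 --query "
CREATE TABLE t_replica_demo (id UInt32, ts DateTime)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/01/t_replica_demo', 'hadoop161')
ORDER BY id;"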

18.5 分片集群

Replicas improve availability and reduce the risk of data loss, but they do not solve horizontal scaling: every node still has to hold the full data set. Sharding addresses horizontal partitioning: a complete data set is split into shards placed on different nodes, and the Distributed table engine stitches them back together for use. The Distributed engine itself stores no data; much like MyCat for MySQL, it acts as middleware that writes, distributes and routes operations to the shards on different nodes through a distributed logical table.

19.Spark

19.1 下载解压

wget https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
md5sum spark-3.0.0-bin-hadoop3.2.tgz 
c5c0af4b4b9cb21d214c7026439df236  spark-3.0.0-bin-hadoop3.2.tgz
tar zxvf spark-3.0.0-bin-hadoop3.2.tgz -C /opt/module/

19.2 部署安装

19.2.1 local模式

Local mode runs Spark on a single machine.

cp -av /opt/module/spark-3.0.0-bin-hadoop3.2/ /opt/module/spark-3.0.0-local/
cd /opt/module/spark-3.0.0-local/
#官方WordCount案例
#本地模式下,默认的调度器为FIFO。
#注意:只有collect开始执行时,才会加载数据。
vim data/word.txt
```
spark 
flink 
hive 
spark
storm
```
./bin/spark-shell
#hadoop161:4040
sc.textFile("data/word.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).collect
#退出
:quit

#官方求PI案例
#该算法是利用蒙特·卡罗算法求PI
./bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master local[2] ./examples/jars/spark-examples_2.12-3.0.0.jar \
10
#--class:表示要执行程序的主类;
#--master local[2]
(1)local: 没有指定线程数,则所有计算都运行在一个线程当中,没有任何并行计算
(2)local[K]:指定使用K个Core来运行计算,比如local[2]就是运行2个Core来执行
(3)local[*]: 自动帮你按照CPU最多核来设置线程数。比如CPU有4核,Spark帮你自动设置4个线程计算。
#spark-examples_2.12-3.0.0.jar:要运行的程序;
#10:要运行程序的输入参数(计算圆周率π的次数,计算次数越多,准确率越高);

19.2.2 Standalone模式

#Standalone模式是Spark自带的资源调动引擎,构建一个由Master + Slave构成的Spark集群,Spark运行在集群中。这个要和Hadoop中的Standalone区别开来。这里的Standalone是指只用Spark来搭建一个集群,不需要借助其他的框架。是相对于Yarn和Mesos来说的。

cp -av /opt/module/spark-3.0.0-bin-hadoop3.2/ /opt/module/spark-3.0.0-standalone
cd /opt/module/spark-3.0.0-standalone/conf
mv slaves.template slaves
#vim slaves
hadoop161
hadoop162
hadoop163
mv spark-env.sh.template spark-env.sh
#vim spark-env.sh
#默认webui端口是8080,和zk冲突,改成了18088
#由于spark-shell停止掉后,4040页面就看不到历史任务的运行情况,所以配置历史服务器记录任务运行情况
SPARK_MASTER_HOST=hadoop163
SPARK_MASTER_PORT=7077
SPARK_MASTER_WEBUI_PORT=18088
export SPARK_HISTORY_OPTS="
-Dspark.history.ui.port=18080 
-Dspark.history.fs.logDirectory=hdfs://hadoop161:8020/spark-standalone
-Dspark.history.retainedApplications=30"

#创建目录
hadoop fs -mkdir /spark-standalone

mv spark-defaults.conf.template spark-defaults.conf
#vim spark-defaults.conf
spark.eventLog.enabled          true
spark.eventLog.dir               hdfs://hadoop161:8020/spark-standalone

#分发同步
xsync /opt/module/spark-3.0.0-standalone
#启动 on hadoop163
sbin/start-all.sh
#http://hadoop163:18088
#启动历史服务器 hadoop163
sbin/start-history-server.sh
#http://hadoop163:18080 

#提交
./bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master spark://hadoop163:7077 \
--executor-memory 4G \
--total-executor-cores 8 \
./examples/jars/spark-examples_2.12-3.0.0.jar \
10

#bin/spark-submit \
#--class <main-class>
#--master <master-url> \
#... # other options
#<application-jar> \
#[application-arguments]

#--class	Spark程序中包含主函数的类	
#--master	Spark程序运行的模式	本地模式:local[*]、spark://hadoop163:7077、Yarn
#--executor-memory 4G	指定每个executor可用内存为4G	符合集群内存配置即可,具体情况具体分析。
#--total-executor-cores 8	指定所有executor使用的cpu核数为8个	
#application-jar	打包好的应用jar,包含依赖。这个URL在集群中全局可见。 比如hdfs:// 共享存储系统,如果是file:// path,那么所有的节点的path都包含同样的jar	
#application-arguments	传给main()方法的参数	

19.2.3 Standalone-HA

ref

#首先停掉所有Standalone集群 hadoop163
./sbin/stop-history-server.sh 
./sbin/stop-all.sh 
#
cp -av spark-3.0.0-standalone spark-3.0.0-standalone-ha

cd spark-3.0.0-standalone-ha
#vim spark-env.sh

#注释掉如下内容:
#SPARK_MASTER_HOST=hadoop163
#SPARK_MASTER_PORT=7077

#添加上如下内容。配置由Zookeeper管理Master,在Zookeeper节点中自动创建/spark目录,用于管理:
export SPARK_DAEMON_JAVA_OPTS="
-Dspark.deploy.recoveryMode=ZOOKEEPER 
-Dspark.deploy.zookeeper.url=hadoop161,hadoop162,hadoop163 
-Dspark.deploy.zookeeper.dir=/spark-3.0.0-standalone-ha"

#启动 hadoop161
sbin/start-all.sh
#hadoop162 & hadoop163
sbin/start-master.sh

#http://hadoop161:18088/ active
#http://hadoop162:18088/ standby
#http://hadoop163:18088/ standby

#Spark HA集群访问
bin/spark-shell \
--master spark://hadoop161:7077,hadoop162:7077,hadoop163:7077 \
--executor-memory 4g \
--total-executor-cores 8
#执行WordCount程序
scala>sc.textFile("hdfs://hadoop161:8020/input").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).collect
#Spark有standalone-client和standalone-cluster两种模式,主要区别在于:Driver程序的运行节点。

#--deploy-mode client
./bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master spark://hadoop161:7077,hadoop162:7077,hadoop163:7077  \
--executor-memory 4G \
--total-executor-cores 8 \
--deploy-mode client \
./examples/jars/spark-examples_2.12-3.0.0.jar \
10

#--deploy-mode cluster
./bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master spark://hadoop161:7077,hadoop162:7077,hadoop163:7077  \
--executor-memory 4G \
--total-executor-cores 8 \
--deploy-mode cluster \
./examples/jars/spark-examples_2.12-3.0.0.jar \
10

#http://hadoop161:18088/
#Completed Drivers --> Worker --> stdout

19.2.4 Yarn模式

#复制新的目录
cp -av /opt/module/spark-3.0.0-bin-hadoop3.2/ /opt/module/spark-3.0.0-yarn
#hadoop yarn-site.xml 之前已经配置了,这里可以跳过
<!--是否启动一个线程检查每个任务正使用的物理内存量,如果任务超出分配值,则直接将其杀掉,默认是true -->
<property>
     <name>yarn.nodemanager.pmem-check-enabled</name>
     <value>false</value>
</property>

<!--是否启动一个线程检查每个任务正使用的虚拟内存量,如果任务超出分配值,则直接将其杀掉,默认是true -->
<property>
     <name>yarn.nodemanager.vmem-check-enabled</name>
     <value>false</value>
</property>
#修改配置文件
mv spark-env.sh.template spark-env.sh
#vim spark-env.sh
YARN_CONF_DIR=/opt/module/hadoop-3.1.3/etc/hadoop
export SPARK_HISTORY_OPTS="
-Dspark.history.ui.port=18080 
-Dspark.history.fs.logDirectory=hdfs://hadoop161:8020/spark-yarn
-Dspark.history.retainedApplications=30"

mv spark-defaults.conf.template spark-defaults.conf
#vim spark-defaults.conf
spark.eventLog.enabled          true
spark.eventLog.dir               hdfs://hadoop161:8020/spark-yarn
spark.yarn.historyServer.address=hadoop163:18080
spark.history.ui.port=18080
#创建hdfs
hadoop fs -mkdir /spark-yarn
#分发同步,如果只在hadoop161上提交可以不分发
xsync /opt/module/spark-3.0.0-yarn
#启动历史服务器 hadoop163
./sbin/start-history-server.sh 
#提交任务
./bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn  \
./examples/jars/spark-examples_2.12-3.0.0.jar \
10
#hadoop162:8088 yarn
#Spark有yarn-client和yarn-cluster两种模式,主要区别在于:Driver程序的运行节点。
#yarn-client:Driver程序运行在客户端,适用于交互、调试,希望立即看到app的输出。
./bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn  \
--deploy-mode client \
./examples/jars/spark-examples_2.12-3.0.0.jar \
10
#yarn-cluster:Driver程序运行在由ResourceManager启动的APPMaster上,适用于生产环境。
./bin/spark-submit \
--class org.apache.spark.examples.SparkPi \
--master yarn  \
--deploy-mode cluster \
./examples/jars/spark-examples_2.12-3.0.0.jar \
10
#http://hadoop162:8088/cluster
#History --> Executors --> stdout
#如果在 yarn 日志端无法查看到具体的日志, 则在yarn-site.xml中添加如下配置并启动Yarn历史服务器
<!-- 设置日志聚集服务器地址 -->
<property>  
    <name>yarn.log.server.url</name>  
    <value>http://hadoop161:19888/jobhistory/logs</value>
</property>

<!-- 设置日志保留时间为7天 -->
<property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>604800</value>
</property>

19.2.5 spark sql

# conf
cp /opt/module/apache-hive-3.1.2/conf/hive-site.xml /opt/module/spark-3.0.0-yarn/conf/
cp /opt/module/hadoop-3.1.3/etc/hadoop/core-site.xml /opt/module/spark-3.0.0-yarn/conf/
cp /opt/module/hadoop-3.1.3/etc/hadoop/hdfs-site.xml /opt/module/spark-3.0.0-yarn/conf/
cp /opt/module/apache-hive-3.1.2/lib/mysql-connector-java-5.1.49-bin.jar /opt/module/spark-3.0.0-yarn/jars/
#lzo
cp /opt/module/hadoop-3.1.3/share/hadoop/common/hadoop-lzo-0.4.21.jar /opt/module/spark-3.0.0-yarn/jars/
#sync
xsync /opt/module/spark-3.0.0-yarn/
#
hive --service metastore &
./bin/spark-sql --master yarn
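A quick non-interactive check, assuming the metastore started above is reachable (-e is a standard spark-sql flag):

```
./bin/spark-sql --master yarn -e "show databases;"
```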

19.2.6 thriftserver

Spark Thrift Server is a Thrift service that the Spark community built on top of HiveServer2, and it aims to be a drop-in replacement for it. Because its interface and protocol are identical to HiveServer2, once Spark Thrift Server is deployed you can connect to it with Hive's beeline and run SQL statements directly. Its goal is only to replace HiveServer2, so it still talks to the Hive Metastore and reuses Hive's metadata.

conf

#手动分配
start-thriftserver.sh \
--master yarn \
--driver-memory 2G \
--executor-memory 2G \
--num-executors 2 \
--executor-cores 2 \
--hiveconf hive.server2.thrift.bind.host=hadoop161 \
--hiveconf hive.server2.thrift.port=20000
#动态分配
start-thriftserver.sh \
--executor-memory 20g \
--executor-cores 5 \
--driver-memory 10g \
--driver-cores 5 \
--conf spark.dynamicAllocation.enabled=true \
--conf spark.shuffle.service.enabled=true \
--conf spark.dynamicAllocation.initialExecutors=20 \
--conf spark.dynamicAllocation.minExecutors=20 \
--conf spark.dynamicAllocation.maxExecutors=400 \
--conf spark.dynamicAllocation.executorIdleTimeout=300s \
--conf spark.dynamicAllocation.schedulerBacklogTimeout=10s \
--conf spark.speculation=true \
--conf spark.speculation.interval=2s \
--conf spark.speculation.multiplier=10 \
--conf spark.speculation.quantile=0.9 \
--hiveconf hive.server2.global.init.file.location=$SPARK_CONF_DIR \
#--hiveconf hive.server2.thrift.bind.host=`hostname -i` \
--hiveconf hive.server2.thrift.bind.host=hadoop161 \
--hiveconf hive.server2.thrift.port=20000
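Once the thrift server is up it can be reached with beeline just like HiveServer2; a minimal sketch using the beeline bundled with Spark (port 20000 matches the start command above, and the user name is arbitrary):

```
/opt/module/spark-3.0.0-yarn/bin/beeline -u jdbc:hive2://hadoop161:20000 -n forsre
# 0: jdbc:hive2://hadoop161:20000> show databases;
```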

sp.sh

#!/bin/bash

if (($#==0))
then
    echo -e "\033[31m start, stop or status \033[0m"
    exit 1;
fi
export HIVE_SERVER2_THRIFT_PORT=20000
export HIVE_SERVER2_THRIFT_BIND_HOST=hadoop161
sp=/opt/module/spark-3.0.0-yarn
case $1 in
"start")
    echo -e "\033[32m starting spark yarn sql server\033[0m"
    source /opt/module/env.sh && cd $sp;sbin/start-thriftserver.sh --master yarn --hiveconf hive.server2.thrift.bind.host=hadoop161 --hiveconf hive.server2.thrift.port=10000
    ;;
"stop")
    echo -e "\033[31m stopping spark yarn sql server \033[0m"
    source /opt/module/env.sh && cd $sp;sbin/stop-thriftserver.sh
    ;;
"status")
    status=$(jps | grep SparkSubmit | grep -v grep | wc -l)
    if (($status==1))
    then
        echo -e "\033[32m spark yarn sql server is running\033[0m"
    else
        echo -e "\033[31m spark yarn sql server is not running!!! \033[0m"
    fi
    ;;
*)
        echo -e "\033[31m start, stop or status \033[0m"
    ;;
esac

Note: start the Hive metastore before starting the Spark thrift server. Also, in my tests the beeline bundled with Hive 3.1.2 is not compatible with the Spark 3.0.0 thrift server, while the Hive 1.2.1 beeline and the beeline 2.3.7 bundled with Spark 3.0.0 both work fine.

If the port passed in by the start script does not change the actual binding, it is best to trim the hive-site.xml inside Spark down to a minimal configuration. My understanding of the precedence is: exported environment variables come first, then the configuration file, and finally the options passed on the command line. Mine is shown below, followed by a quick check of which port actually gets bound:

hive-site.xml

<configuration>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://hadoop161:3306/metastore?useSSL=false</value>
    </property>

    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
    </property>

    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>root</value>
    </property>

    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>000000</value>
    </property>

    <property>
        <name>hive.cli.print.header</name>
        <value>true</value>
    </property>

    <property>
        <name>hive.cli.print.current.db</name>
        <value>true</value>
    </property>
</configuration>
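To see which port the thrift server actually bound after the precedence rules above, a hedged check (ss ships with iproute on CentOS 7):

```
sudo ss -lntp | grep -E '10000|20000'
# if the exported HIVE_SERVER2_THRIFT_PORT really wins, the SparkSubmit process
# should be listening on 20000 even though sp.sh passes 10000 on the command line
```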

20.Flink

20.1 下载解压

# https://flink.apache.org/downloads.html#flink
# https://archive.apache.org/dist/flink/

wget https://archive.apache.org/dist/flink/flink-1.12.0/flink-1.12.0-bin-scala_2.11.tgz
#md5sum flink-1.12.0-bin-scala_2.11.tgz
#19347dcda2268dd20f4884ecc4dc9ed9  flink-1.12.0-bin-scala_2.11.tgz
tar zxvf flink-1.12.0-bin-scala_2.11.tgz -C /opt/module

20.2 部署安装

20.2.1 local 模式

cp -av /opt/module/flink-1.12.0 /opt/module/flink-1.12.0-local
cd /opt/module/flink-1.12.0-local
./bin/start-cluster.sh
#http://hadoop161:8081
#batch
./bin/flink run examples/batch/WordCount.jar
#streaming
./bin/flink run examples/streaming/WordCount.jar --output ./wc.txt
#指定输入输出
./bin/flink run examples/streaming/WordCount.jar --input input.txt --output wc.txt
#可以在log日志查看执行结果
#也可以在WEB UI提交应用
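For a streaming job that keeps running instead of exiting immediately, the distribution also ships SocketWindowWordCount; a minimal sketch (nc must be installed, and --hostname/--port are that example's own arguments):

```
# terminal 1: a socket source to type words into
nc -lk 9999
# terminal 2: submit the job, then watch the counts in the TaskManager stdout/log
./bin/flink run examples/streaming/SocketWindowWordCount.jar --hostname hadoop161 --port 9999
```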

20.2.2 standalone模式

#Standalone模式又叫独立集群模式。
cp -av /opt/module/flink-1.12.0 /opt/module/flink-1.12.0-standalone
cd /opt/module/flink-1.12.0-standalone
#vim flink-conf.yaml
jobmanager.rpc.address: hadoop161
#vim workers
hadoop161
hadoop162
hadoop163
#分发
xsync /opt/module/flink-1.12.0-standalone
#启动
./bin/start-cluster.sh
#http://hadoop161:8081/#/overview
#streaming
./bin/flink run examples/streaming/WordCount.jar --output ./wc.txt

20.2.3 standalone-HA

#首先停掉所有Standalone集群 hadoop161
./bin/stop-cluster.sh 
cp -av /opt/module/flink-1.12.0-standalone /opt/module/flink-1.12.0-standalone-ha
# flink-conf.yaml
high-availability: zookeeper
high-availability.storageDir: hdfs://hadoop161:8020/flink-1.12.0-standalone-ha
high-availability.zookeeper.quorum: hadoop161:2181,hadoop162:2181,hadoop163:2181
high-availability.zookeeper.path.root: /flink-1.12.0-standalone-ha
high-availability.cluster-id: /cluster_forsre
#vim masters
hadoop161:8081
hadoop162:8081
hadoop163:8081
#vim /opt/module/env.sh
export HADOOP_CLASSPATH=`hadoop classpath`
xsync /opt/module/env.sh
#分发
xsync /opt/module/flink-1.12.0-standalone-ha
cd /opt/module/flink-1.12.0-standalone-ha
./bin/start-cluster.sh
#http://hadoop161:8081/
#http://hadoop162:8081/
#http://hadoop163:8081/
#zk
get /flink-1.12.0-standalone-ha/cluster_forsre/leader/rest_server_lock

20.2.4 yarn模式

#将Flink应用提交给Yarn的ResourceManager, Yarn的ResourceManager会申请容器从Yarn的NodeManager上面. Flink会创建JobManager和TaskManager在这些容器上.Flink会根据运行在JobManger上的job的需要的slot的数量动态的分配TaskManager资源
cp -av /opt/module/flink-1.12.0-standalone /opt/module/flink-1.12.0-yarn
#vim /opt/module/env.sh
export HADOOP_CLASSPATH=`hadoop classpath`
xsync /opt/module/env.sh
#如果只在hadoop161上提交可以不分发
xsync /opt/module/flink-1.12.0-yarn
#Flink提供了yarn上运行的3种模式,分别为Session-Cluster,Application Mode和Per-Job-Cluster模式。

#Session-Cluster
#Session-Cluster模式需要先启动Flink集群,向Yarn申请资源, 资源申请到以后,永远保持不变。以后提交任务都向这里提交。这个Flink集群会常驻在yarn集群中,除非手工停止。在向Flink集群提交Job的时候, 如果资源被用完了,则新的Job不能正常提交.
#缺点: 如果提交的作业中有长时间执行的大作业, 占用了该Flink集群的所有资源, 则后续无法提交新的job.
#所以, Session-Cluster适合那些需要频繁提交的多个小Job, 并且执行时间都不长的Job.
#1.	启动一个Flink-Session
./bin/yarn-session.sh -d 
#2. 执行任务
./bin/flink run examples/streaming/WordCount.jar
#3. 停止 id是上面启动的时候出现的
#In order to stop Flink gracefully, use the following command:
#$ echo "stop" | ./bin/yarn-session.sh -id application_1629729804836_0005
#If this should not be possible, then you can also kill Flink via YARN's web interface or via:
#$ yarn application -kill application_1629729804836_0005
#Note that killing Flink might not clean up all job artifacts and temporary files.
echo "stop" | ./bin/yarn-session.sh -id application_1629729804836_0005

#Application Mode
#Application Mode会在Yarn上启动集群, 应用jar包的main函数(用户类的main函数)将会在JobManager上执行. 只要应用程序执行结束, Flink集群会马上被关闭. 也可以手动停止集群.与Per-Job-Cluster的区别: 就是Application Mode下, 用户的main函数是在集群中执行的
#官方建议:出于生产的需求, 我们建议使用Per-job or Application Mode,因为他们给应用提供了更好的隔离!
#https://ci.apache.org/projects/flink/flink-docs-release-1.12/deployment/#deployment-modes
# 执行任务
./bin/flink run-application -t yarn-application examples/streaming/WordCount.jar
# 提交到队列
./bin/flink run-application -t yarn-application -Dyarn.application.queue=hive examples/streaming/WordCount.jar

#Per-Job-Cluster
#一个Job会对应一个Flink集群,每提交一个作业会根据自身的情况,都会单独向yarn申请资源,直到作业执行完成,一个作业的失败与否并不会影响下一个作业的正常提交和运行。独享Dispatcher和ResourceManager,按需接受资源申请;适合规模大长时间运行的作业。
./bin/flink run -d -t yarn-per-job examples/streaming/WordCount.jar
#提交到其它队列
./bin/flink run -d -m yarn-cluster -yqu hive examples/streaming/WordCount.jar
./bin/flink run -d -t yarn-per-job -Dyarn.application.queue=hive examples/streaming/WordCount.jar

20.2.5 yarn-ha

#yarn模式的高可用和Standalone模式的高可用原理不一样。Standalone模式中, 同时启动多个Jobmanager, 一个为leader其他为standby的, 当leader挂了, 其他的才会有一个成为leader。yarn的高可用是同时只启动一个Jobmanager, 当这个Jobmanager挂了之后, yarn会再次启动一个, 其实是利用的yarn的重试次数来实现的高可用.
cp -av /opt/module/flink-1.12.0-yarn /opt/module/flink-1.12.0-yarn-ha

#yarn-site.xml
<property>
  <name>yarn.resourcemanager.am.max-attempts</name>
  <value>4</value>
  <description>
    The maximum number of application master execution attempts.
  </description>
</property>

#配置完分发和重启yarn
#flink-conf.yaml中配置
# yarn-site.xml中配置的是重试次数的上限, flink-conf.yaml中的次数应该小于这个值。
yarn.application-attempts: 3
high-availability: zookeeper
high-availability.storageDir: hdfs://hadoop161:8020/flink-1.12.0-yarn-ha
high-availability.zookeeper.quorum: hadoop161:2181,hadoop162:2181,hadoop163:2181
high-availability.zookeeper.path.root: /flink-1.12.0-yarn-ha
#启动yarn-session
#kill Jobmanager测试
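A hedged way to run that kill test (YarnSessionClusterEntrypoint is the session JobManager's main class in Flink 1.12; the pid and the ZooKeeper path below are placeholders):

```
cd /opt/module/flink-1.12.0-yarn-ha
./bin/yarn-session.sh -d
# find and kill the session JobManager
jps | grep YarnSessionClusterEntrypoint
kill -9 <pid>        # <pid> taken from the jps output above
# YARN restarts the application master (bounded by yarn.application-attempts);
# the new leader address can then be read back from ZooKeeper:
# get /flink-1.12.0-yarn-ha/<cluster-id>/leader/rest_server_lock
```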

21.Debezium

22.Doris

23.Hudi

24.Iceberg

25.InfluxDB

26.Impala

27.Kudu

28.HUE

29.Canal

30.Maxwell

31.DataX

33.Kettle

34.CDH 6.3.2

35.Debezium

36.Nginx

# http://nginx.org/en/download.html
wget http://nginx.org/download/nginx-1.20.1.tar.gz
tar zxvf nginx-1.20.1.tar.gz
cd nginx-1.20.1/
sudo yum install pcre-devel zlib-devel openssl-devel -y
./configure --prefix=/opt/module/nginx-1.20.1 --with-http_ssl_module 
make -j 4 && make install
#非root启动1024以下端口
sudo setcap cap_net_bind_service=+eip /opt/module/nginx-1.20.1/sbin/nginx
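A hedged check that the capability stuck and that nginx starts as a normal user (paths follow the --prefix used above):

```
getcap /opt/module/nginx-1.20.1/sbin/nginx       # expect cap_net_bind_service+eip
/opt/module/nginx-1.20.1/sbin/nginx              # start
/opt/module/nginx-1.20.1/sbin/nginx -s stop      # stop
```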
--prefix= 指向安装目录
--sbin-path 指向(执行)程序文件(nginx)
--conf-path= 指向配置文件(nginx.conf)
--error-log-path= 指向错误日志目录
--pid-path= 指向pid文件(nginx.pid)
--lock-path= 指向lock文件(nginx.lock)(安装文件锁定,防止安装文件被别人利用,或自己误操作。)
--user= 指定程序运行时的非特权用户
--group= 指定程序运行时的非特权用户组
--builddir= 指向编译目录
--with-rtsig_module 启用rtsig模块支持(实时信号)
--with-select_module 启用select模块支持(一种轮询模式,不推荐在高载环境下使用)禁用:--without-select_module
--with-poll_module 启用poll模块支持(功能与select相同,与select特性相同,为一种轮询模式,不推荐在高载环境下使用)
--with-file-aio 启用file aio支持(一种APL文件传输格式)
--with-ipv6 启用ipv6支持
--with-http_ssl_module 启用ngx_http_ssl_module支持(使支持https请求,需已安装openssl)
--with-http_realip_module 启用ngx_http_realip_module支持(这个模块允许从请求标头更改客户端的IP地址值,默认为关)
--with-http_addition_module 启用ngx_http_addition_module支持(作为一个输出过滤器,支持不完全缓冲,分部分响应请求)
--with-http_xslt_module 启用ngx_http_xslt_module支持(过滤转换XML请求)
--with-http_image_filter_module 启用ngx_http_image_filter_module支持(传输JPEG/GIF/PNG 图片的一个过滤器)(默认为不启用。gd库要用到)
--with-http_geoip_module 启用ngx_http_geoip_module支持(该模块创建基于与MaxMind GeoIP二进制文件相配的客户端IP地址的ngx_http_geoip_module变量)
--with-http_sub_module 启用ngx_http_sub_module支持(允许用一些其他文本替换nginx响应中的一些文本)
--with-http_dav_module 启用ngx_http_dav_module支持(增加PUT,DELETE,MKCOL:创建集合,COPY和MOVE方法)默认情况下为关闭,需编译开启
--with-http_flv_module 启用ngx_http_flv_module支持(提供寻求内存使用基于时间的偏移量文件)
--with-http_gzip_static_module 启用ngx_http_gzip_static_module支持(在线实时压缩输出数据流)
--with-http_random_index_module 启用ngx_http_random_index_module支持(从目录中随机挑选一个目录索引)
--with-http_secure_link_module 启用ngx_http_secure_link_module支持(计算和检查要求所需的安全链接网址)
--with-http_degradation_module 启用ngx_http_degradation_module支持(允许在内存不足的情况下返回204或444码)
--with-http_stub_status_module 启用ngx_http_stub_status_module支持(获取nginx自上次启动以来的工作状态)
--without-http_charset_module 禁用ngx_http_charset_module支持(重新编码web页面,但只能是一个方向--服务器端到客户端,并且只有一个字节的编码可以被重新编码)
--without-http_gzip_module 禁用ngx_http_gzip_module支持(该模块同--with-http_gzip_static_module功能一样)
--without-http_ssi_module 禁用ngx_http_ssi_module支持(该模块提供了一个在输入端处理服务器包含文件(SSI)的过滤器,目前支持SSI命令的列表是不完整的)
--without-http_userid_module 禁用ngx_http_userid_module支持(该模块用来处理用来确定客户端后续请求的cookies)
--without-http_access_module 禁用ngx_http_access_module支持(该模块提供了一个简单的基于主机的访问控制。允许/拒绝基于ip地址)
--without-http_auth_basic_module 禁用ngx_http_auth_basic_module(该模块是可以使用用户名和密码基于http基本认证方法来保护你的站点或其部分内容)
--without-http_autoindex_module 禁用ngx_http_autoindex_module支持(该模块用于自动生成目录列表,只在ngx_http_index_module模块未找到索引文件时发出请求。)
--without-http_geo_module 禁用ngx_http_geo_module支持(创建一些变量,其值依赖于客户端的IP地址)
--without-http_map_module 禁用ngx_http_map_module支持(使用任意的键/值对设置配置变量)
--without-http_split_clients_module 禁用ngx_http_split_clients_module支持(该模块用来基于某些条件划分用户。条件如:ip地址、报头、cookies等等)
--without-http_referer_module 禁用ngx_http_referer_module支持(该模块用来过滤请求,拒绝报头中Referer值不正确的请求)
--without-http_rewrite_module 禁用ngx_http_rewrite_module支持(该模块允许使用正则表达式改变URI,并且根据变量来转向以及选择配置。如果在server级别设置该选项,那么他们将在location之前生效。如果在location还有更进一步的重写规则,location部分的规则依然会被执行。如果这个URI重写是因为location部分的规则造成的,那么location部分会再次被执行作为新的URI。这个循环会执行10次,然后Nginx会返回一个500错误。)
--without-http_proxy_module 禁用ngx_http_proxy_module支持(有关代理服务器)
--without-http_fastcgi_module 禁用ngx_http_fastcgi_module支持(该模块允许Nginx与FastCGI进程交互,并通过传递参数来控制FastCGI进程工作。)FastCGI是一个常驻型的公共网关接口。
--without-http_uwsgi_module 禁用ngx_http_uwsgi_module支持(该模块用来使用uwsgi协议,uWSGI服务器相关)
--without-http_scgi_module 禁用ngx_http_scgi_module支持(该模块用来启用SCGI协议支持,SCGI协议是CGI协议的替代。它是一种应用程序与HTTP服务接口标准。它有些像FastCGI但他的设计更容易实现。)
--without-http_memcached_module 禁用ngx_http_memcached_module支持(该模块用来提供简单的缓存,以提高系统效率)
--without-http_limit_zone_module 禁用ngx_http_limit_zone_module支持(该模块可以针对条件,进行会话的并发连接数控制)
--without-http_limit_req_module 禁用ngx_http_limit_req_module支持(该模块允许你对于一个地址进行请求数量的限制用一个给定的session或一个特定的事件)
--without-http_empty_gif_module 禁用ngx_http_empty_gif_module支持(该模块在内存中常驻了一个1*1的透明GIF图像,可以被非常快速的调用)
--without-http_browser_module 禁用ngx_http_browser_module支持(该模块用来创建依赖于请求报头的值。如果浏览器为modern,则$modern_browser等于modern_browser_value指令分配的值;如果浏览器为old,则$ancient_browser等于ancient_browser_value指令分配的值;如果浏览器为MSIE中的任意版本,则$msie等于1)
--without-http_upstream_ip_hash_module 禁用ngx_http_upstream_ip_hash_module支持(该模块用于简单的负载均衡)
--with-http_perl_module 启用ngx_http_perl_module支持(该模块使nginx可以直接使用perl或通过ssi调用perl)
--with-perl_modules_path= 设定perl模块路径
--with-perl= 设定perl库文件路径
--http-log-path= 设定access log路径
--http-client-body-temp-path= 设定http客户端请求临时文件路径
--http-proxy-temp-path= 设定http代理临时文件路径
--http-fastcgi-temp-path= 设定http fastcgi临时文件路径
--http-uwsgi-temp-path= 设定http uwsgi临时文件路径
--http-scgi-temp-path= 设定http scgi临时文件路径
--without-http 禁用http server功能
--without-http-cache 禁用http cache功能
--with-mail 启用POP3/IMAP4/SMTP代理模块支持
--with-mail_ssl_module 启用ngx_mail_ssl_module支持
--without-mail_pop3_module 禁用pop3协议(POP3即邮局协议的第3个版本,它是规定个人计算机如何连接到互联网上的邮件服务器进行收发邮件的协议。是因特网电子邮件的第一个离线协议标准,POP3协议允许用户从服务器上把邮件存储到本地主机上,同时根据客户端的操作删除或保存在邮件服务器上的邮件。POP3协议是TCP/IP协议族中的一员,主要用于支持使用客户端远程管理在服务器上的电子邮件)
--without-mail_imap_module 禁用imap协议(一种邮件获取协议。它的主要作用是邮件客户端可以通过这种协议从邮件服务器上获取邮件的信息,下载邮件等。IMAP协议运行在TCP/IP协议之上,使用的端口是143。它与POP3协议的主要区别是用户可以不用把所有的邮件全部下载,可以通过客户端直接对服务器上的邮件进行操作。)
--without-mail_smtp_module 禁用smtp协议(SMTP即简单邮件传输协议,它是一组用于由源地址到目的地址传送邮件的规则,由它来控制信件的中转方式。SMTP协议属于TCP/IP协议族,它帮助每台计算机在发送或中转信件时找到下一个目的地。)
--with-google_perftools_module 启用ngx_google_perftools_module支持(调试用,剖析程序性能瓶颈)
--with-cpp_test_module 启用ngx_cpp_test_module支持
--add-module= 启用外部模块支持
--with-cc= 指向C编译器路径
--with-cpp= 指向C预处理路径
--with-cc-opt= 设置C编译器参数(PCRE库,需要指定--with-cc-opt="-I /usr/local/include",如果使用select()函数则需要同时增加文件描述符数量,可以通过--with-cc-opt="-D FD_SETSIZE=2048"指定。)
--with-ld-opt= 设置连接文件参数。(PCRE库,需要指定--with-ld-opt="-L /usr/local/lib"。)
--with-cpu-opt= 指定编译的CPU,可用的值为: pentium, pentiumpro, pentium3, pentium4, athlon, opteron, amd64, sparc32, sparc64, ppc64
--without-pcre 禁用pcre库
--with-pcre 启用pcre库
--with-pcre= 指向pcre库文件目录
--with-pcre-opt= 在编译时为pcre库设置附加参数
--with-md5= 指向md5库文件目录(消息摘要算法第五版,用以提供消息的完整性保护)
--with-md5-opt= 在编译时为md5库设置附加参数
--with-md5-asm 使用md5汇编源
--with-sha1= 指向sha1库目录(数字签名算法,主要用于数字签名)
--with-sha1-opt= 在编译时为sha1库设置附加参数
--with-sha1-asm 使用sha1汇编源
--with-zlib= 指向zlib库目录
--with-zlib-opt= 在编译时为zlib设置附加参数
--with-zlib-asm= 为指定的CPU使用zlib汇编源进行优化,CPU类型为pentium, pentiumpro
--with-libatomic 为原子内存的更新操作的实现提供一个架构
--with-libatomic= 指向libatomic_ops安装目录
--with-openssl= 指向openssl安装目录
--with-openssl-opt 在编译时为openssl设置附加参数
--with-debug 启用debug日志
http{
   ..........
    upstream logserver{
      server    hadoop161:8080 weight=1;  
      server    hadoop162:8080 weight=1;
      server    hadoop163:8080 weight=1;
 
    }
    server {
        listen       80;
        server_name  logserver;
 
        location / {
            root   html;
            index  index.html index.htm;
            proxy_pass http://logserver;
            proxy_connect_timeout 10;
 
         }
   ..........
}
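After editing conf/nginx.conf as above, a minimal validate-and-reload sequence; what actually answers on port 8080 of the three hadoop nodes is whatever log-collector service sits behind the balancer, which is an assumption of this sketch:

```
/opt/module/nginx-1.20.1/sbin/nginx -t           # syntax check
/opt/module/nginx-1.20.1/sbin/nginx -s reload
# repeated requests should be spread across hadoop161/162/163
curl -s -o /dev/null -w "%{http_code}\n" http://hadoop161/
```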

37.Redis

# https://download.redis.io/releases/
wget https://download.redis.io/releases/redis-3.2.5.tar.gz
#md5sum redis-3.2.5.tar.gz
#d3d2b4dd4b2a3e07ee6f63c526b66b08  redis-3.2.5.tar.gz
tar -zxvf redis-3.2.5.tar.gz
cd redis-3.2.5
sudo yum install gcc gcc-c++ -y
make -j 4 && make PREFIX=/opt/module/redis-3.2.5 install

#conf
mkdir /opt/module/redis-3.2.5/conf/
mkdir /opt/module/redis-3.2.5/data/
cp /opt/soft/redis-3.2.5/redis.conf /opt/module/redis-3.2.5/conf/
cd /opt/module/redis-3.2.5/conf/
#vim redis.conf
daemonize no -> yes
bind 127.0.0.1 -> bind 0.0.0.0
protected-mode yes -> protected-mode no
dir ./ -> dir /opt/module/redis-3.2.5/data/

#显示中文或者按数据原有格式打印数据,不展示额外的类型信息
redis-cli  --raw
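A minimal smoke test once the server is running (start it with rs.sh below, or directly with redis-server and the config above):

```
/opt/module/redis-3.2.5/bin/redis-cli -h hadoop161 ping      # expect PONG
/opt/module/redis-3.2.5/bin/redis-cli -h hadoop161 set k1 v1
/opt/module/redis-3.2.5/bin/redis-cli -h hadoop161 get k1    # expect "v1"
```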

rs.sh

#!/bin/bash

redis_home=/opt/module/redis-3.2.5
if (($#==0))
then
    echo -e "\033[31m start, stop or status \033[0m"
    exit 1;
fi
case $1 in
"start")
    echo -e "\033[32m starting redis server\033[0m"
    $redis_home/bin/redis-server $redis_home/conf/redis.conf
    ;;
"stop")
    echo -e "\033[31m stopping redis server \033[0m"
    $redis_home/bin/redis-cli shutdown
    ;;
"status")
    status=$(ps -ef | grep redis-server | grep -v grep | wc -l)
    if (($status==1))
    then
        echo -e "\033[32m redis server is running\033[0m"
    else
        echo -e "\033[31m redis server is not running!!! \033[0m"
    fi
    ;;
*)
        echo -e "\033[31m start, stop or status \033[0m"
    ;;
esac