
Running Hadoop

잇꼬 2024. 3. 12. 00:49

#) Add a host entry
[root@centos ~]# vi /etc/hosts

192.168.56.10 centos


#) Flush the firewall rules and list them
[root@centos ~]# iptables -F
[root@centos ~]# iptables -L

#) Check and change the hostname
[root@centos ~]# hostnamectl
[root@centos ~]# hostnamectl set-hostname centos

#) Set up the JDK
[root@centos ~]# mkdir -p /usr/java
[root@centos ~]# ls

anaconda-ks.cfg  Downloads                   Music     Templates
Desktop          initial-setup-ks.cfg        Pictures  Videos
Documents        jdk-8u131-linux-x64.tar.gz  Public

 

[root@centos ~]# mv jdk-8u131-linux-x64.tar.gz /usr/java

#) Verify
[root@centos ~]# cd /usr/java
[root@centos java]# ls

jdk-8u131-linux-x64.tar.gz


#) Extract the archive and verify
[root@centos java]# tar xvfz jdk-8u131-linux-x64.tar.gz
[root@centos java]# ls

jdk1.8.0_131  jdk-8u131-linux-x64.tar.gz


[root@centos java]# cd jdk1.8.0_131/
[root@centos jdk1.8.0_131]# pwd

/usr/java/jdk1.8.0_131


#) Set environment variables
[root@centos java]# cd
[root@centos ~]# vi /etc/profile

...
export JAVA_HOME=/usr/java/jdk1.8.0_131
export PATH=$PATH:$JAVA_HOME/bin
export CLASSPATH="."


#) Apply the settings and verify
[root@centos ~]# source /etc/profile
[root@centos ~]# echo $JAVA_HOME

/usr/java/jdk1.8.0_131

[root@centos ~]# java -version

openjdk version "1.8.0_262"
OpenJDK Runtime Environment (build 1.8.0_262-b10)
OpenJDK 64-Bit Server VM (build 25.262-b10, mixed mode)


#) Switch from OpenJDK to Oracle JDK
[root@centos ~]# which java

/usr/bin/java


#1) Register the new JDK as an alternative
[root@centos ~]# update-alternatives --install "/usr/bin/java" "java" "/usr/java/jdk1.8.0_131/bin/java" 1

#2) List the alternatives and change the selection
[root@centos ~]# update-alternatives --config java

There are 4 programs which provide 'java'.

  Selection    Command
-----------------------------------------------
   1           java-1.7.0-openjdk.x86_64 (/usr/lib/jvm/java-1.7.0-openjdk-1.7.0.261-2.6.22.2.el7_8.x86_64/jre/bin/java)
*  2 (default)  java-1.8.0-openjdk.x86_64 (/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.262.b10-1.el7.x86_64/jre/bin/java)
 + 3           /usr/java/jdk1.8.0_131
   4           /usr/java/jdk1.8.0_131/bin/java ← switch to this one

Enter to keep the current selection[+], or type selection number: 4 # (the entry to switch to)



#) Check the Java version
[root@centos ~]# java -version

java version "1.8.0_131"
Java(TM) SE Runtime Environment (build 1.8.0_131-b11)
Java HotSpot(TM) 64-Bit Server VM (build 25.131-b11, mixed mode)


#) Remove an alternative (here, the stray entry that points at the directory rather than the java binary)
[root@centos ~]# update-alternatives --remove java /usr/java/jdk1.8.0_131

#) Create the hadoop group and verify
[root@centos ~]# groupadd hadoop
[root@centos ~]# tail /etc/group

sshd:x:74:
slocate:x:21:
avahi:x:70:
postdrop:x:90:
postfix:x:89:
tcpdump:x:72:
ora:x:1000:ora
vboxsf:x:981:
vboxdrmipc:x:980:
hadoop:x:1001:


#) Create the hadoop user
[root@centos ~]# useradd -g hadoop hadoop
[root@centos ~]# tail /etc/passwd

...
ora:x:1000:1000:ora:/home/ora:/bin/bash
vboxadd:x:987:1::/var/run/vboxadd:/bin/false
hadoop:x:1001:1001::/home/hadoop:/bin/bash


#) Set the hadoop user's password
[root@centos ~]# passwd hadoop

Changing password for user hadoop.
New password: (enter 1234)
BAD PASSWORD: The password is shorter than 8 characters
Retype new password: (1234 again)
passwd: all authentication tokens updated successfully.


#) Log in as the hadoop user
[root@centos ~]# su - hadoop

# Transfer the Hadoop distribution to the hadoop user
<hadoop sess>
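If FileZilla isn't at hand, scp from the machine holding the download does the same job (a sketch; the source path is wherever your tarball sits):

$ scp hadoop-3.2.4.tar.gz hadoop@192.168.56.10:/home/hadoop/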


#) Verify the Hadoop tarball
[hadoop@centos ~]$ ls

hadoop-3.2.4.tar.gz


#) Extract and verify
[hadoop@centos ~]$ tar xvzf hadoop-3.2.4.tar.gz
[hadoop@centos ~]$ ls

hadoop-3.2.4  hadoop-3.2.4.tar.gz

[hadoop@centos ~]$ cd hadoop-3.2.4/
[hadoop@centos hadoop-3.2.4]$ pwd

/home/hadoop/hadoop-3.2.4


#) Configure the Hadoop environment
[hadoop@centos ~]$ vi .bashrc

...
export JAVA_HOME=/usr/java/jdk1.8.0_131
export HADOOP_HOME=/home/hadoop/hadoop-3.2.4
export HADOOP_CONFIG_HOME=$HADOOP_HOME/etc/hadoop
export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin


[hadoop@centos ~]$ source .bashrc
[hadoop@centos ~]$ echo $HADOOP_HOME

/home/hadoop/hadoop-3.2.4



#) Check the Java version
[hadoop@centos ~]$ java -version

java version "1.8.0_131"
Java(TM) SE Runtime Environment (build 1.8.0_131-b11)
Java HotSpot(TM) 64-Bit Server VM (build 25.131-b11, mixed mode)


#) Check the Hadoop version
[hadoop@centos ~]$ hadoop version

Hadoop 3.2.4
Source code repository Unknown -r 7e5d9983b388e372fe640f21f048f2f2ae6e9eba
Compiled by ubuntu on 2022-07-12T11:58Z
Compiled with protoc 2.5.0
From source with checksum ee031c16fe785bbb35252c749418712
This command was run using /home/hadoop/hadoop-3.2.4/share/hadoop/common/hadoop-common-3.2.4.jar


#) Generate an SSH key pair (removing any old .ssh directory first)
[hadoop@centos ~]$ rm -rf .ssh

[hadoop@centos ~]$ ssh-keygen

Generating public/private rsa key pair.
Enter file in which to save the key (/home/hadoop/.ssh/id_rsa): (press Enter)
Created directory '/home/hadoop/.ssh'.
Enter passphrase (empty for no passphrase): (press Enter)
Enter same passphrase again:
Your identification has been saved in /home/hadoop/.ssh/id_rsa.
Your public key has been saved in /home/hadoop/.ssh/id_rsa.pub.
The key fingerprint is:
SHA256:63EWNqV222giNXlGtkE1BKX2ZQY5C+R2U1eXQrzEQAU hadoop@centos
The key's randomart image is:
+---[RSA 2048]----+
|          .E@B*.B|
|           o.*o=o|
|            @.=o+|
|           O *.= |
|        S X = .  |
|         = B +   |
|        + + + .  |
|       . = o     |
|        .        |
+----[SHA256]-----+


#) Install the public key
[hadoop@centos ~]$ ssh-copy-id -i /home/hadoop/.ssh/id_rsa.pub hadoop@192.168.56.10

/bin/ssh-copy-id: INFO: Source of key(s) to be installed: "/home/hadoop/.ssh/id_rsa.pub"
The authenticity of host '192.168.56.10 (192.168.56.10)' can't be established.
ECDSA key fingerprint is SHA256:bYsd9x46qx8MnQT7Mrvp/9XiyzE3sXnm2Gh1Jhg4GOQ.
ECDSA key fingerprint is MD5:99:a1:31:6c:7a:79:48:ab:26:74:a7:31:6d:27:06:62.
Are you sure you want to continue connecting (yes/no)? yes (type yes)
/bin/ssh-copy-id: INFO: attempting to log in with the new key(s), to filter out any that are already installed
/bin/ssh-copy-id: INFO: 1 key(s) remain to be installed -- if you are prompted now it is to install the new keys
hadoop@192.168.56.10's password: (enter 1234)

Number of key(s) added: 1

Now try logging into the machine, with:   "ssh 'hadoop@192.168.56.10'"
and check to make sure that only the key(s) you wanted were added.



#) Verify that login no longer prompts for a password
[hadoop@centos ~]$ ssh hadoop@192.168.56.10

Last login: Mon Mar 11 12:01:16 2024



#) Leaving the SSH session
[hadoop@centos ~]$ exit

logout
Connection to 192.168.56.10 closed.



#) Connecting again with the key
[hadoop@centos ~]$ ssh hadoop@192.168.56.10

Last login: Mon Mar 11 12:16:42 2024 from centos


#1) Hadoop configuration: check the config path
[hadoop@centos ~]$ cd $HADOOP_HOME/etc/hadoop
[hadoop@centos hadoop]$ pwd

/home/hadoop/hadoop-3.2.4/etc/hadoop

[hadoop@centos hadoop]$ ls

capacity-scheduler.xml  hadoop-env.sh                     httpfs-env.sh            kms-env.sh            mapred-env.sh               ssl-server.xml.example         yarnservice-log4j.properties
configuration.xsl       hadoop-metrics2.properties        httpfs-log4j.properties  kms-log4j.properties  mapred-queues.xml.template  user_ec_policies.xml.template  yarn-site.xml
container-executor.cfg  hadoop-policy.xml                 httpfs-signature.secret  kms-site.xml          mapred-site.xml             workers
core-site.xml           hadoop-user-functions.sh.example  httpfs-site.xml          log4j.properties      shellprofile.d              yarn-env.cmd
hadoop-env.cmd          hdfs-site.xml                     kms-acls.xml             mapred-env.cmd        ssl-client.xml.example      yarn-env.sh


#2) Edit the Hadoop environment script and save
[hadoop@centos hadoop]$ vi hadoop-env.sh

...
export JAVA_HOME=/usr/java/jdk1.8.0_131
export HADOOP_HOME=/home/hadoop/hadoop-3.2.4


#3) Set the master's IP address (create a new file)
[hadoop@centos hadoop]$ vi masters

192.168.56.10


#4) Configure the data nodes (enter the IP address)
[hadoop@centos hadoop]$ vi workers

# localhost (delete this line)
192.168.56.10


#5) Core settings
[hadoop@centos hadoop]$ vi core-site.xml

...
<configuration>
 <property>
  <name>fs.defaultFS</name>
  <value>hdfs://centos:9010</value>
 </property>
 <property>
  <name>hadoop.tmp.dir</name>
  <value>/home/hadoop/hadoop-3.2.4/tmp</value>
 </property>
</configuration>
:wq (save)


#6) HDFS settings
[hadoop@centos hadoop]$ vi hdfs-site.xml

...
<configuration>
 <property>
  <name>dfs.replication</name>
  <value>1</value> # default is 3
 </property>
 <property>
  <name>dfs.namenode.name.dir</name>
  <value>/home/hadoop/data/dfs/namenode</value>
 </property>
 <property>
  <name>dfs.namenode.checkpoint.dir</name>
  <value>/home/hadoop/data/dfs/namesecondary</value>
 </property>
 <property>
  <name>dfs.datanode.data.dir</name>
  <value>/home/hadoop/data/dfs/datanode</value>
 </property>
 <property>
  <name>dfs.http.address</name>
  <value>centos:50070</value>
 </property>
 <property>
  <name>dfs.secondary.http.address</name>
  <value>centos:50090</value>
 </property>
</configuration>
:wq (save)


#7) Distributed data processing (MapReduce) settings
[hadoop@centos hadoop]$ vi mapred-site.xml

...
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>yarn.app.mapreduce.am.env</name>
        <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
    </property>
    <property>
        <name>mapreduce.map.env</name>
        <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
    </property>
    <property>
        <name>mapreduce.reduce.env</name>
        <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
    </property>
</configuration>
:wq (save)


#8) Cluster resource management and job scheduling (YARN)
[hadoop@centos hadoop]$ vi yarn-site.xml

...
<configuration>
 <property>
  <name>yarn.nodemanager.aux-services</name>
  <value>mapreduce_shuffle</value>
 </property>
 <property>
  <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
  <value>org.apache.hadoop.mapred.ShuffleHandler</value>
 </property>
 <property>
  <name>yarn.nodemanager.local-dirs</name>
  <value>/home/hadoop/data/yarn/nm-local-dir</value>
 </property>
 <property>
  <name>yarn.resourcemanager.fs.state-store.uri</name>
  <value>/home/hadoop/data/yarn/system/rmstore</value>
 </property>
 <property>
  <name>yarn.resourcemanager.hostname</name>
  <value>centos</value>
 </property>
 <property>
  <name>yarn.web-proxy.address</name>
  <value>0.0.0.0:8089</value>
 </property>
</configuration>
:wq (save)
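Before moving on, it's worth a quick well-formedness check of the XML files just edited; xmllint (from libxml2, normally preinstalled on CentOS) reports any broken tags:

[hadoop@centos hadoop]$ xmllint --noout core-site.xml hdfs-site.xml mapred-site.xml yarn-site.xml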


#) YARN environment variables
[hadoop@centos hadoop]$ vi yarn-env.sh

...
JAVA=$JAVA_HOME/bin/java
JAVA_HEAP_MAX=-Xmx1000m
:wq (save)

 


# Format the Hadoop filesystem
#) Format the NameNode storage
[hadoop@centos ~]$ hdfs namenode -format

WARNING: /home/hadoop/hadoop-3.2.4/logs does not exist. Creating.
2024-03-11 14:05:40,764 INFO namenode.NameNode: STARTUP_MSG:
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG:   host = centos/192.168.56.10
STARTUP_MSG:   args = [-format]
STARTUP_MSG:   version = 3.2.4
...
2024-03-11 14:05:42,325 INFO namenode.NameNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at centos/192.168.56.10
************************************************************/
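One caution before continuing: re-running hdfs namenode -format generates a new clusterID, and DataNodes still holding the old one will refuse to start. If a re-format is ever needed, a clean sequence under this walkthrough's directory layout looks roughly like this:

[hadoop@centos ~]$ stop-all.sh
[hadoop@centos ~]$ rm -rf /home/hadoop/data/dfs/* /home/hadoop/hadoop-3.2.4/tmp/*
[hadoop@centos ~]$ hdfs namenode -format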



# Start the Hadoop daemons
#) Bring up the daemons
[hadoop@centos ~]$ start-all.sh

WARNING: Attempting to start all Apache Hadoop daemons as hadoop in 10 seconds.
WARNING: This is not a recommended production deployment configuration.
WARNING: Use CTRL-C to abort.
Starting namenodes on [centos]
centos: Warning: Permanently added 'centos' (ECDSA) to the list of known hosts.
Starting datanodes
Starting secondary namenodes [centos]
Starting resourcemanager
Starting nodemanagers
0.0.0.0: Warning: Permanently added '0.0.0.0' (ECDSA) to the list of known hosts.


# Check the started daemons
#) Verify with jps
[hadoop@centos ~]$ jps

17170 DataNode ★
18114 Jps
17571 ResourceManager ★
17028 NameNode ★
17353 SecondaryNameNode ★
18089 WebAppProxyServer
17691 NodeManager ★
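With the daemons up, the web UIs should answer as well: the NameNode UI on the port set in hdfs-site.xml above, and the ResourceManager UI on YARN's default port 8088. A quick probe with curl (any 2xx or 3xx status code means the daemon is listening):

[hadoop@centos ~]$ curl -s -o /dev/null -w "%{http_code}\n" http://centos:50070    # NameNode web UI
[hadoop@centos ~]$ curl -s -o /dev/null -w "%{http_code}\n" http://centos:8088     # ResourceManager web UI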


# Stopping the Hadoop daemons (shown for reference; the daemons must be running for the HDFS commands below)
[hadoop@centos ~]$ stop-all.sh

# Check HDFS contents
[hadoop@centos ~]$ hdfs dfs -ls /
(nothing yet)

# Create a directory
[hadoop@centos ~]$ hdfs dfs -mkdir /user

# Check HDFS again
[hadoop@centos ~]$ hdfs dfs -ls /

Found 1 items
drwxr-xr-x   - hadoop supergroup          0 2024-03-11 14:12 /user


#) Transfer winter.txt (e.g. via FileZilla), then verify
[hadoop@centos ~]$ ls

data  hadoop-3.2.4  hadoop-3.2.4.tar.gz  winter.txt


# Copy a local file into HDFS
[hadoop@centos ~]$ hdfs dfs -put /home/hadoop/winter.txt /user
[hadoop@centos ~]$ hdfs dfs -ls /user

Found 1 items
-rw-r--r--   1 hadoop supergroup     114545 (size) 2024-03-11 14:18 /user/winter.txt


#) Change directory
[hadoop@centos ~]$ cd $HADOOP_HOME/share/hadoop/mapreduce
[hadoop@centos mapreduce]$ pwd

/home/hadoop/hadoop-3.2.4/share/hadoop/mapreduce


#) Check the files

- hadoop-mapreduce-examples-3.2.4.jar: contains the word-count example
[hadoop@centos mapreduce]$ ls

hadoop-mapreduce-client-app-3.2.4.jar     hadoop-mapreduce-client-hs-3.2.4.jar          hadoop-mapreduce-client-jobclient-3.2.4-tests.jar  hadoop-mapreduce-client-uploader-3.2.4.jar  lib
hadoop-mapreduce-client-common-3.2.4.jar  hadoop-mapreduce-client-hs-plugins-3.2.4.jar  hadoop-mapreduce-client-nativetask-3.2.4.jar       hadoop-mapreduce-examples-3.2.4.jar         lib-examples
hadoop-mapreduce-client-core-3.2.4.jar    hadoop-mapreduce-client-jobclient-3.2.4.jar   hadoop-mapreduce-client-shuffle-3.2.4.jar          jdiff                                       sources


#) Run the word count, then check the output
[hadoop@centos mapreduce]$ yarn jar hadoop-mapreduce-examples-3.2.4.jar wordcount /user/winter.txt output
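One caveat for re-runs: MapReduce fails with FileAlreadyExistsException if the output directory already exists, so remove it before launching the job again:

[hadoop@centos mapreduce]$ hdfs dfs -rm -r output   # only needed when re-running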

[hadoop@centos mapreduce]$ hdfs dfs -ls output

Found 2 items
-rw-r--r--   1 hadoop supergroup          0 2024-03-11 14:28 output/_SUCCESS
-rw-r--r--   1 hadoop supergroup      48033 2024-03-11 14:27 output/part-r-00000


#) Inspect the result
tip) this can be done from the home directory
[hadoop@centos mapreduce]$ cd 
[hadoop@centos ~]$ hdfs dfs -cat output/part-r-00000 | tail -10

  In    1
  Let   1
  Love  1
  Reindeer(s)   1
  The   1
  bout  1
  enhance       1
  of-course-she-isn  t          1
  wakes 1
  yells         1


# Copy a file stored in HDFS back to the local filesystem
[hadoop@centos ~]$ hdfs dfs -get output/part-r-00000 /home/hadoop/wc_output
[hadoop@centos ~]$ ls

data  hadoop-3.2.4  hadoop-3.2.4.tar.gz  wc_output  winter.txt

[hadoop@centos ~]$ vi wc_output

!       1
"Final  1
&       4
'CAUSE  2
(12)    2
(5)     1
(8)     1
(8),    1
(9)     7
(AS     7
...
:q! (view only; quit without saving)


#) List all directories recursively
[hadoop@centos ~]$ hdfs dfs -ls -R /

drwx------   - hadoop supergroup          0 2024-03-11 14:26 /tmp
drwx------   - hadoop supergroup          0 2024-03-11 14:26 /tmp/hadoop-yarn
drwx------   - hadoop supergroup          0 2024-03-11 14:26 /tmp/hadoop-yarn/staging
drwx------   - hadoop supergroup          0 2024-03-11 14:26 /tmp/hadoop-yarn/staging/hadoop
drwx------   - hadoop supergroup          0 2024-03-11 14:28 /tmp/hadoop-yarn/staging/hadoop/.staging
drwxr-xr-x   - hadoop supergroup          0 2024-03-11 14:26 /tmp/hadoop-yarn/staging/history
drwxrwxrwt   - hadoop supergroup          0 2024-03-11 14:26 /tmp/hadoop-yarn/staging/history/done_intermediate
drwxrwx---   - hadoop supergroup          0 2024-03-11 14:28 /tmp/hadoop-yarn/staging/history/done_intermediate/hadoop
-rwxrwx---   1 hadoop supergroup      22808 2024-03-11 14:28 /tmp/hadoop-yarn/staging/history/done_intermediate/hadoop/job_1710133662349_0001-1710134780865-hadoop-word+count-1710134881333-1-1-SUCCEEDED-default-1710134829636.jhist
-rwxrwx---   1 hadoop supergroup        442 2024-03-11 14:28 /tmp/hadoop-yarn/staging/history/done_intermediate/hadoop/job_1710133662349_0001.summary
-rwxrwx---   1 hadoop supergroup     235647 2024-03-11 14:28 /tmp/hadoop-yarn/staging/history/done_intermediate/hadoop/job_1710133662349_0001_conf.xml
drwxr-xr-x   - hadoop supergroup          0 2024-03-11 14:27 /user
drwxr-xr-x   - hadoop supergroup          0 2024-03-11 14:27 /user/hadoop
drwxr-xr-x   - hadoop supergroup          0 2024-03-11 14:28 /user/hadoop/output
-rw-r--r--   1 hadoop supergroup          0 2024-03-11 14:28 /user/hadoop/output/_SUCCESS
-rw-r--r--   1 hadoop supergroup      48033 2024-03-11 14:27 /user/hadoop/output/part-r-00000
-rw-r--r--   1 hadoop supergroup     114545 2024-03-11 14:18 /user/winter.txt


#) List a specific directory
[hadoop@centos ~]$ hdfs dfs -ls -R /user/hadoop

drwxr-xr-x   - hadoop supergroup          0 2024-03-11 14:28 /user/hadoop/output
-rw-r--r--   1 hadoop supergroup          0 2024-03-11 14:28 /user/hadoop/output/_SUCCESS
-rw-r--r--   1 hadoop supergroup      48033 2024-03-11 14:27 /user/hadoop/output/part-r-00000


#) Check disk usage
[hadoop@centos ~]$ hdfs dfs -du

48033  48033  output

[hadoop@centos ~]$ hdfs dfs -du output

0      0      output/_SUCCESS
48033  48033  output/part-r-00000


#) Totals

-du: disk usage

-s: summarize the total disk usage of all subdirectories and files

[hadoop@centos ~]$ hdfs dfs -du -s

48033  48033  .
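hdfs dfs -du also accepts -h, which prints the same figures in human-readable units instead of raw bytes:

[hadoop@centos ~]$ hdfs dfs -du -s -h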


#) Delete a file
[hadoop@centos ~]$ hdfs dfs -rm /user/hadoop/output/_SUCCESS

Deleted /user/hadoop/output/_SUCCESS


#) Delete a directory
[hadoop@centos ~]$ hdfs dfs -rm -r /user/hadoop/output

Deleted /user/hadoop/output


# Install Hive
#) Transfer the Hive tarball to the hadoop account (e.g. via FileZilla)
[hadoop@centos ~]$ ls

apache-hive-3.1.3-bin.tar.gz  data  hadoop-3.2.4  hadoop-3.2.4.tar.gz  wc_output  winter.txt


#) Extract the archive
[hadoop@centos ~]$ tar xvzf apache-hive-3.1.3-bin.tar.gz
[hadoop@centos ~]$ ls

apache-hive-3.1.3-bin  apache-hive-3.1.3-bin.tar.gz  data  
hadoop-3.2.4  hadoop-3.2.4.tar.gz  wc_output  winter.txt

[hadoop@centos ~]$ cd apache-hive-3.1.3-bin/
[hadoop@centos apache-hive-3.1.3-bin]$ pwd

/home/hadoop/apache-hive-3.1.3-bin


# Environment settings
- append to the existing .bashrc entries
[hadoop@centos ~]$ vi .bashrc

...
export HIVE_HOME=/home/hadoop/apache-hive-3.1.3-bin
export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HIVE_HOME/bin
:wq (save)


#) Apply and verify
[hadoop@centos ~]$ source .bashrc
[hadoop@centos ~]$ echo $HIVE_HOME

/home/hadoop/apache-hive-3.1.3-bin


#) Configure Hive
[hadoop@centos ~]$ cd $HIVE_HOME/conf

#) Apache Hive settings: control Hive's behavior and set user-defined variables (new file)
[hadoop@centos conf]$ vi hive-site.xml

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
       <name>hive.metastore.warehouse.dir</name>
       <value>/user/hive/warehouse</value>
  </property>
  <property>
       <name>hive.cli.print.header</name>
       <value>true</value>
  </property>
</configuration>
:wq (save)
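Once the CLI is running (started further below), the set command at the hive> prompt echoes a property's effective value, which is a quick way to confirm this file was picked up:

hive> set hive.cli.print.header;

hive.cli.print.header=true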


#) Set the Hadoop location in Hive's startup configuration

[hadoop@centos ~]$ vi $HIVE_HOME/bin/hive-config.sh

...
export HADOOP_HOME=/home/hadoop/hadoop-3.2.4


#) Swap the guava jar: Hive 3.1.3 ships guava-19.0, which conflicts with Hadoop 3.2.4's guava-27.0 and breaks schematool
[hadoop@centos conf]$ cp $HADOOP_HOME/share/hadoop/common/lib/guava-27.0-jre.jar $HIVE_HOME/lib/guava-27.0-jre.jar
[hadoop@centos conf]$ mv $HIVE_HOME/lib/guava-19.0.jar $HIVE_HOME/lib/guava-19.0.jar.bak
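A quick check that only the newer guava remains active on Hive's classpath (the renamed .bak file is ignored because it no longer ends in .jar):

[hadoop@centos conf]$ ls $HIVE_HOME/lib | grep guava

guava-19.0.jar.bak
guava-27.0-jre.jar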

#) Create the warehouse directory and verify
[hadoop@centos conf]$ cd
[hadoop@centos ~]$ hdfs dfs -mkdir -p /user/hive/warehouse
[hadoop@centos ~]$ hdfs dfs -ls -R /user/hive

drwxr-xr-x   - hadoop supergroup          0 2024-03-11 15:13 /user/hive/warehouse


#) Add group write permission
[hadoop@centos ~]$ hdfs dfs -chmod g+w /user/hive/warehouse
[hadoop@centos ~]$ hdfs dfs -ls -R /user/hive

drwxrwxr-x   - hadoop supergroup          0 2024-03-11 15:13 /user/hive/warehouse


#) Initialize the metastore schema (embedded Derby creates metastore_db and derby.log in the current directory, so run this from the home directory)
[hadoop@centos ~]$ schematool -dbType derby -initSchema

...
(blank lines)
Initialization script completed
schemaTool completed


#) Start Hive

[hadoop@centos ~]$ hive 

which: no hbase in (/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/usr/java/jdk1.8.0_131/bin:/usr/java/jdk1.8.0_131/bin:/home/hadoop/hadoop-3.2.4/bin:/home/hadoop/hadoop-3.2.4/sbin:/home/hadoop/.local/bin:/home/hadoop/bin:/usr/java/jdk1.8.0_131/bin:/home/hadoop/hadoop-3.2.4/bin:/home/hadoop/hadoop-3.2.4/sbin:/home/hadoop/apache-hive-3.1.3-bin/bin)
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/hadoop/apache-hive-3.1.3-bin/lib/log4j-slf4j-impl-2.17.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/hadoop/hadoop-3.2.4/share/hadoop/common/lib/slf4j-reload4j-1.7.35.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Hive Session ID = bf2b12bc-433d-4237-a409-91ac7489195f

Logging initialized using configuration in jar:file:/home/hadoop/apache-hive-3.1.3-bin/lib/hive-common-3.1.3.jar!/hive-log4j2.properties Async: true
Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
Hive Session ID = 016d0d09-4002-460d-a03e-e7b09b6fb51b
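The SLF4J multiple-bindings warning above is harmless, as the "Actual binding" line shows which one won. If you want it gone, a common workaround is to sideline Hive's own binding so Hadoop's is the only one left (optional, and not part of this transcript):

[hadoop@centos ~]$ mv $HIVE_HOME/lib/log4j-slf4j-impl-2.17.1.jar $HIVE_HOME/lib/log4j-slf4j-impl-2.17.1.jar.bak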



#) List databases
hive> show databases;

OK
database_name
default
Time taken: 0.606 seconds, Fetched: 1 row(s)


#) List tables
hive> show tables;

OK
tab_name
Time taken: 0.057 seconds


#) Transfer files to the hadoop account
transfer dept.csv and emp.csv

# new session open #
<hadoop sess>

[hadoop@centos ~]$ ls

apache-hive-3.1.3-bin  apache-hive-3.1.3-bin.tar.gz  data  dept.csv  derby.log  emp.csv  
hadoop-3.2.4  hadoop-3.2.4.tar.gz  metastore_db  wc_output  winter.txt

[hadoop@centos ~]$ ls emp.csv dept.csv
dept.csv  emp.csv

#) Copy the files into HDFS and verify
[hadoop@centos ~]$ hdfs dfs -put /home/hadoop/emp.csv /user
[hadoop@centos ~]$ hdfs dfs -put /home/hadoop/dept.csv /user
[hadoop@centos ~]$ hdfs dfs -ls /user

Found 5 items
-rw-r--r--   1 hadoop supergroup        635 2024-03-11 15:26 /user/dept.csv ★
-rw-r--r--   1 hadoop supergroup       8017 2024-03-11 15:26 /user/emp.csv ★
drwxr-xr-x   - hadoop supergroup          0 2024-03-11 14:58 /user/hadoop
drwxr-xr-x   - hadoop supergroup          0 2024-03-11 15:13 /user/hive
-rw-r--r--   1 hadoop supergroup     114545 2024-03-11 14:18 /user/winter.txt


[hadoop@centos ~]$  hdfs dfs -cat /user/emp.csv | head -2
[hadoop@centos ~]$  hdfs dfs -cat /user/dept.csv | head -2

#) In Hive, create the emp table

create table emp (
empno int, 
fname string, 
lname string, 
phone string, 
hiredate string, 
job string, 
sal int, 
comm int, 
mgr int, 
deptno int) 
row format delimited 
fields terminated by ',' 
lines terminated by '\n' 
stored as textfile;


#) Check the tables
hive> show tables;

#) Describe emp
hive> desc emp;

#) Query the data
hive> select * from emp;

# Load a local file into Hive
- overwrite: delete the existing data, then load
hive> load data local inpath '/home/hadoop/emp.csv' overwrite into table emp;
hive> select * from emp;

# append: without overwrite, rows are added after the existing data
hive> load data local inpath '/home/hadoop/emp.csv' into table emp;
hive> select * from emp;

# Drop the table
hive> drop table emp;

# Recreate the table
hive> 

CREATE TABLE emp (
    empno INT,
    fname STRING,
    lname STRING,
    phone STRING,
    hiredate STRING,
    job STRING,
    sal INT,
    comm INT,
    mgr INT,
    deptno INT
    )
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;


hive> show tables;

# Load data from HDFS into Hive (note: LOAD DATA INPATH moves the file into the warehouse rather than copying it)
hive> load data inpath 'hdfs://centos:9010/user/emp.csv' overwrite into table emp;

Loading data to table default.emp
OK
Time taken: 1.861 seconds
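Since LOAD DATA INPATH moves the source rather than copying it, emp.csv is gone from /user afterwards; a shell session confirms:

[hadoop@centos ~]$ hdfs dfs -ls /user/emp.csv

ls: `/user/emp.csv': No such file or directory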


#) Check the table, then drop it

hive> select * from emp;
hive> drop table emp;

# hadoop session # (new window)
[hadoop@centos ~]$ hdfs dfs -mkdir /user/hive/warehouse/emp
[hadoop@centos ~]$ hdfs dfs -ls -R /user/hive/

drwxrwxr-x   - hadoop supergroup          0 2024-03-11 16:07 /user/hive/warehouse
drwxr-xr-x   - hadoop supergroup          0 2024-03-11 16:07 /user/hive/warehouse/emp

 

[hadoop@centos ~]$ hdfs dfs -put /home/hadoop/emp.csv /user/hive/warehouse/emp
[hadoop@centos ~]$ hdfs dfs -ls -R /user/hive/

drwxrwxr-x   - hadoop supergroup          0 2024-03-11 16:07 /user/hive/warehouse
drwxr-xr-x   - hadoop supergroup          0 2024-03-11 16:08 /user/hive/warehouse/emp
-rw-r--r--   1 hadoop supergroup       8017 2024-03-11 16:08 /user/hive/warehouse/emp/emp.csv


#) Connect to Hive and create the table
#1) Create an external table (the first statement below is the plain managed form, shown for comparison; the EXTERNAL variant after it points at the existing HDFS directory)

CREATE TABLE emp (
  empno INT,
  fname STRING,
  lname STRING,
  mail STRING,
  phone STRING,
  hiredate STRING,
  job STRING,
  sal INT,
  comm INT,
  mgr INT,
  deptno INT) 
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

 

CREATE external TABLE if not exists emp (
  empno INT,
  fname STRING,
  lname STRING,
  phone STRING,
  hiredate STRING,
  job STRING,
  sal INT,
  comm INT,
  mgr INT,
  deptno INT) 
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/user/hive/warehouse/emp'; <- the data is stored at this (actual) path


#) Check the tables
hive> show tables;

OK
tab_name
emp
Time taken: 0.046 seconds, Fetched: 1 row(s)


#) Query
hive> select * from emp;

OK
...
Time taken: 0.286 seconds, Fetched: 107 row(s)


#) Check the table metadata
hive> describe formatted emp;

OK
col_name        data_type       comment
# col_name              data_type               comment
empno                   int
fname                   string
lname                   string
phone                   string
hiredate                string
job                     string
sal                     int
comm                    int
mgr                     int
deptno                  int

# Detailed Table Information
Database:               default
OwnerType:              USER
Owner:                  hadoop
CreateTime:             Mon Mar 11 16:11:44 KST 2024
LastAccessTime:         UNKNOWN
Retention:              0
Location:               hdfs://centos:9010/user/hive/warehouse/emp
Table Type:             MANAGED_TABLE
Table Parameters:
        COLUMN_STATS_ACCURATE   {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"comm\":\"true\",\"deptno\":\"true\",\"empno\":\"true\",\"fname\":\"true\",\"hiredate\":\"true\",\"job\":\"true\",\"lname\":\"true\",\"mgr\":\"true\",\"phone\":\"true\",\"sal\":\"true\"}}
        bucketing_version       2
        numFiles                0
        numRows                 0
        rawDataSize             0
        totalSize               0
        transient_lastDdlTime   1710141104

# Storage Information
SerDe Library:          org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
InputFormat:            org.apache.hadoop.mapred.TextInputFormat
OutputFormat:           org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
Compressed:             No
Num Buckets:            -1
Bucket Columns:         []
Sort Columns:           []
Storage Desc Params:
        field.delim             ,
        line.delim              \n
        serialization.format    ,
Time taken: 0.165 seconds, Fetched: 42 row(s)


#) WHERE-clause examples
hive> select * from emp where empno = 100;
hive> select * from emp where empno = 100 or empno = 200;
hive> select * from emp where empno in (100, 200);
hive> select * from emp where sal >= 10000;
hive> select * from emp where sal >= 10000 and sal <= 11000;
hive> select * from emp where sal between 10000 and 11000;
hive> select * from emp where lname like 'K%';
hive> select * from emp where lname like '%g';
hive> select * from emp where lname like '_i%';
hive> select * from emp where lname like 'K___';
hive> select * from emp where lname like '%in%' or lname like '%un%';
hive> select * from emp where lname rlike '.*in|un.*';
hive> select * from emp where deptno is null;
hive> select * from emp where deptno is not null;
hive> select * from emp where deptno not in (30, 40, 50, 60);
hive> select * from emp where job not like '%CLERK%';
hive> select * from emp where sal not between 10000 and 20000;

#) Arithmetic operators
- %: remainder (modulo)
hive> select 1+2, 4-2, 4*2, 4/2, 7%2;

OK
_c0     _c1     _c2     _c3     _c4
3       2       8       2.0     1
Time taken: 0.563 seconds, Fetched: 1 row(s)


hive> select empno, sal, comm, sal * 12 + sal * 12 * nvl(comm, 0) from emp;
hive> select concat(empno, fname) from emp;
hive> select substr(empno, 1,2), substr(fname,-2,2) from emp;

#) Case sensitivity
hive> select * from emp where lname ='king';
hive> select * from emp where lower(lname) ='king';
hive> select * from emp where lcase(lname) ='king';
hive> select * from emp where upper(lname) ='KING';
hive> select * from emp where ucase(lname) ='KING';

#) Character substitution
hive> select phone, translate(phone, '.', '-') from emp where deptno = 20;

#) Remove duplicates
hive> select distinct deptno from emp;

#) String length
hive> select length(lname) from emp;

hive> select round(45.926, 2), round(45.926), round(45.926, -1), round(55.926, -2);
hive> select trunc(45.926, 2), trunc(45.926), trunc(45.926, -1), trunc(55.926, -2);
hive> select ceil(10.0001), floor(10.0001);

#) Dates
hive> select * from emp where hiredate like '2003%';
hive> select * from emp where hiredate between to_date('2003-01-01') and to_date('2003-12-31');
hive> select hiredate, date_format(hiredate, 'MM-dd-yyyy') from emp;
hive> select current_date;
hive> select current_timestamp;
hive> select current_date, year(current_date);
hive> select current_date, month(current_date);
hive> select current_date, day(current_date);
hive> select hiredate, year(hiredate) from emp;
hive> select current_timestamp, hour(current_timestamp);
hive> select current_timestamp, minute(current_timestamp);
hive> select current_timestamp, second(current_timestamp);
hive> select current_timestamp, weekofyear(current_timestamp);
#) Day arithmetic
hive> select datediff(current_date, '2023-10-05');
hive> select date_add(current_date, 100);
hive> select date_sub(current_date, 100);
hive> select add_months(current_date, 12);
hive> select add_months(current_date, 12, 'yyyyMMdd');
hive> select last_day(current_date);
hive> select next_day(current_date, 'FRIDAY');
hive>  select next_day(current_date, 'FRI');
hive>  select months_between('2003-10-05', current_date);
hive>  select extract(year from current_date), extract(month from current_date), extract(day from current_date);
hive>  select extract(hour from current_timestamp), extract(minute from current_timestamp), extract(second from current_timestamp);

#) Strings + numbers (casting)
hive>  select '100' + 100;
hive>  select cast('100' as int) + 100;
hive>  select cast('100' as double) + 100;
hive>  select cast(100 as string);
hive>  select cast(100.01 as float);
hive>  select cast('true' as boolean);

#) Aggregate functions
hive>  select count(*), count(deptno) from emp;
hive>  select sum(sal), avg(sal), max(sal), min(sal), stddev_pop(sal), variance(sal) from emp;
hive>  select deptno, sum(sal) 
from emp 
group by deptno 
having sum(sal) > 10000;
hive>  select deptno, sum(sal) sum_sal 
from emp 
group by deptno 
having sum(sal) > 10000 
order by sum_sal desc;

#) Joins are supported only in ANSI syntax; see the sketch below.
#) The same goes for subqueries.
#) Existence checks are done with EXISTS.
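
A short sketch of both, assuming a dept table has been created from dept.csv along the same lines as emp (the dname column is illustrative, not from this transcript):

hive> select e.lname, d.dname
from emp e join dept d on (e.deptno = d.deptno);

hive> select * from emp e
where exists (select 1 from dept d where d.deptno = e.deptno);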

 

