Language/Python

231122 Python line plot(꺾은선그래프), 도수분포표(frequency table), histogram, box plot, 줄기 잎 그림(stem and leaf diagram), 산점도(scatter plot), folium

잇꼬 2023. 11. 22. 17:07
728x90
반응형
SMALL

# 기본 import

from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname='C:/Windows/Fonts/gulim.ttc').get_name()
rc('font', family=font_name)

import numpy as np # 색상 import
import stemgraphic # 줄기 잎 그림 그래프
import folium # 지도 시각화 라이브러리


# 원하는 파일 read_csv 불러오기

# Load the employee table from CSV.
emp = pd.read_csv('c:/data/emp.csv')
# Convert HIRE_DATE to datetime64 so date components can be read via the .dt accessor.
emp.HIRE_DATE = pd.to_datetime(emp.HIRE_DATE)
# Print the column / dtype summary to confirm the conversion.
emp.info()



3. line plot (= 꺾은선 그래프)
    #1) 선을 그리는 그래프
    #2) 시간, 순서 등에 따라 어떻게 변하는지를 보여주는 그래프(daily용으로 많이 사용)

# Count hires per calendar year (value_counts orders by frequency, not by year).
years = emp['HIRE_DATE'].dt.year.value_counts()
years
years.sort_index() # returns a sorted copy; `years` itself is unchanged here
years.sort_index(inplace=True) # sort `years` by year in place so the line runs left to right
years.plot() # draw the line plot
# Decorate the figure: tick labels, axis labels and title
plt.xticks(ticks=years.index, 
           labels=[str(i)+'년' for i in years.index],
           rotation= 45)
plt.xlabel('')
plt.ylabel('인원수', size=10)
plt.title('년도별 입사현황', size=20)
plt.show()



# Bar chart: one bar per hire year, bar height = number of hires

plt.bar(x=years.index, height=years)


# Colours
# np.linspace generates evenly spaced numbers over an interval.

np.linspace(start=0, stop=1, num=10, endpoint=False)  # stop value excluded
np.linspace(start=0, stop=1, num=10, endpoint=True)   # stop value included

cmap = plt.get_cmap("PuRd")
cmap
# Sample one colour per bar. The count follows len(years) instead of a
# hard-coded 8, so the palette still matches if the data gains or loses years.
colors = [cmap(i) for i in np.linspace(start=0, stop=1, num=len(years), endpoint=True)]
colors

# Locate the peak from the data rather than hard-coding (2005, 29), so the
# annotation stays on the tallest bar if emp.csv changes.
peak_year = years.idxmax()
peak_count = years.max()

# Variant 1: plain text label plus an annotation without an arrow.
plt.bar(x=years.index, height=years, color=colors)
plt.text(peak_year, peak_count, '최대값')
plt.annotate(text='max',
             xy=(peak_year, peak_count),  # point being annotated (tuple)
             xytext=(2001, 20))           # label position, chosen by eye for this data

# Variant 2: same annotation with an arrow drawn from the label to the peak.
plt.bar(x=years.index, height=years, color=colors)
plt.annotate(text='max',
             xy=(peak_year, peak_count),  # point being annotated (tuple)
             xytext=(2001, 20),
             arrowprops={'arrowstyle':'wedge',
                         'facecolor':'red',
                         'color':'blue'})



# 최대값 강조

# Line plot of hires per year with an arrow pointing at the maximum.
plt.plot(years.index, years)
plt.xticks(ticks=years.index,
           labels=[str(i)+'년' for i in years.index],
           rotation=45)
plt.xlabel('')
plt.ylabel('인원수', size=10)
plt.title('년도별 입사현황', size=20)
# Arrow only (empty text). The arrow head is taken from the data instead of
# the hard-coded (2005, 29) so it keeps pointing at the true maximum.
plt.annotate(text='',
             xy=(years.idxmax(), years.max()),  # arrow head (tuple)
             xytext=(2001, 20),                 # arrow tail, chosen by eye for this data
             arrowprops={'arrowstyle':'wedge',
                         'facecolor':'red',
                         'color':'blue'})
plt.show()



# linestyle

dashdot dashed dotted  solid 
-. -- : -


# linestyle 변경

# Same figure as the max-highlight plot, drawn with a dash-dot line style.
plt.plot(years.index, years, linestyle='dashdot')
plt.xticks(ticks=years.index,
           labels=[str(i)+'년' for i in years.index],
           rotation=45)
plt.xlabel('')
plt.ylabel('인원수', size=10)
plt.title('년도별 입사현황', size=20)
# Arrow only (empty text). The arrow head is taken from the data instead of
# the hard-coded (2005, 29) so it keeps pointing at the true maximum.
plt.annotate(text='',
             xy=(years.idxmax(), years.max()),  # arrow head (tuple)
             xytext=(2001, 20),                 # arrow tail, chosen by eye for this data
             arrowprops={'arrowstyle':'wedge',
                         'facecolor':'red',
                         'color':'blue'})
plt.show()


# 도수분포표(frequency table)
    1) 미리 구간을 설정해 각 구간의 범위안에 조사된 데이터들이 몇개씩 속하는가를 나타내는 표
    

ages = [21, 24, 26, 27, 29, 31, 37, 39, 40, 42, 45, 50, 51, 59, 60, 69]
# Expected frequency table:
# ========================
#  class          count
# ========================
#  20s (20~29)      5
#  30s (30~39)      3
#  40s (40~49)      3
#  50s (50~59)      3
#  60s (60~69)      2
# ========================
# Approach: a loop plus a dictionary mapping class -> count.


# Start every class (keyed by the lower bound of its decade) at zero.

frequency_table = dict.fromkeys(range(20, 61, 10), 0)
frequency_table


# Sanity check: print every age, one per line.

print(*ages, sep='\n')


# Tally each age into its decade. Ages below 20 are skipped and everything
# from 60 upwards lands in the last class.

for age in ages:
    if age < 20:
        continue
    decade = min(age // 10 * 10, 60)
    frequency_table[decade] += 1

frequency_table



# 수치형 자료를 범주형 자료로 변환한 후 빈도수를 구하세요.

ages = [21, 24, 26, 27, 29, 31, 37, 39, 40, 42, 45, 50, 51, 59, 60, 69]  # numeric data

# Goal: a categorical list of the form ['20대', '20대', ..., '60대', '60대'].
# (Kept as a comment: the original assigned a placeholder list containing a
# literal Ellipsis that was overwritten on the very next statement.)

# Map each age to its decade label. Ages below 20 are skipped and everything
# from 60 upwards is labelled '60대'.
ages_label = [
    f'{int(min(age // 10, 6)) * 10}대'
    for age in ages
    if age >= 20
]

 

# ☆가장 편한 코드

# Build the Series once and reuse it instead of re-wrapping the list each time.
ages_series = Series(ages_label)

ages_series.value_counts() # frequency of each label
pd.crosstab(index=ages_series, columns='빈도수')
# distinct labels
ages_series.unique()


# The same with NumPy

np.unique(ages_series)
np.unique(ages_series, return_counts=True) # also returns the counts as an array

# Unpack the (labels, counts) pair from a single call rather than calling
# np.unique twice and indexing [0] / [1].
x, y = np.unique(ages_series, return_counts=True)  # x: classes, y: frequencies

DataFrame({'계급':x, '도수':y})


■ cut
    #1) 연속형 데이터를 범주형 데이터로 변환하는 함수

ages = [21, 24, 26, 27, 29, 31, 37, 39, 40, 42, 45, 50, 51, 59, 60, 69]
# Bin edges 20, 30, ..., 70. Per the pandas docs, values that fall outside
# the edges come back as NaN rather than raising.
bins = list(range(20, 71, 10))

# right=True (the default): intervals are (20, 30], i.e. 20 < age <= 30
pd.cut(ages, bins, right=True)
# right=False: intervals are [20, 30), i.e. 20 <= age < 30
# (pd.cut has no `left` parameter; left-closed bins are requested with right=False)
pd.cut(ages, bins, right=False)


# Count how many values fall into each bin

pd.cut(ages, bins, right=False).value_counts()

age_cut = pd.cut(ages, bins, right=False)
age_cut.value_counts()
age_cut.codes # position of each value's bin within `categories`
#  0,  0,  0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,  4,  4 : codes (index)
# 21, 24, 26, 27, 29, 31, 37, 39, 40, 42, 45, 50, 51, 59, 60, 69 : ages (value)
age_cut.categories


# =============================================================================

ages = [21, 24, 26, 27, 29, 31, 37, 39, 40, 42, 45, 50, 51, 59, 60, 69]
bins = list(range(20, 71, 10))
# One label per bin, named after the bin's lower edge: '20대' ... '60대'.
label = [f'{lower}대' for lower in bins[:-1]]

# Left-closed bins ([20, 30), ...) shown with readable labels instead of intervals.
age_cut = pd.cut(ages, bins, right=False, labels=label)
age_cut.value_counts()
age_cut.codes
age_cut.categories


4. histogram 
    #1) 자료가 모여 있는 위치나 자료의 분포에 관한 대략적인 정보를 한 눈에 파악할 수 있는 그래프

# Sample ages for the histogram demos (unsorted; 40 appears twice).
ages = [21,24,26,27,29,31,37,40,39,40,42,45,50,51,59,60,69,22]

# `bins` given as an integer is the number of bins.
plt.hist(ages)
plt.hist(ages, bins=5)
plt.hist(ages, bins=10)

# `bins` given as a list supplies explicit bin edges.
bins = [20, 30, 40, 50, 60, 70]
plt.hist(ages, bins=bins)

# density=True normalises the heights; histtype='step' draws an outline only.
plt.hist(ages, bins=bins, density=True, histtype='step')
plt.hist(ages, 
         bins=bins, # bin edges
         orientation='horizontal', # horizontal bars
         color='pink', 
         rwidth=0.5 ) # bar width relative to the bin width (leaves gaps)


# array 형식

# Load the weight measurements as a NumPy array.
weight = np.loadtxt('c:/data/weight.txt')
weight
weight.shape # (rows, columns)
# reshape demos. -1 lets NumPy infer that dimension from the data, so nothing
# breaks if the file does not hold exactly 50 values.
weight.reshape(-1) # 1-D
weight.reshape(-1).shape
weight.reshape((1, -1)).shape # [[...]]: one extra dimension, a single row
weight.reshape((-1, 1)) # a single column

# Flatten to 1-D for the Series / cut / boxplot calls.
weight = weight.reshape(-1)
weight.shape
weight.max()
weight.min()

bins = list(range(50, 101, 10))
bins
# One label per bin, named after the bin's lower edge. The last edge (100)
# only closes the final bin, so it gets no label.
label = [str(i)+'kg 이상' for i in bins[:-1]]
label
# The labels read "X kg or more", so the bins must be left-closed
# ([50, 60), ...). With the default right=True a weight of exactly 50 would
# fall outside every bin and 60 would be labelled '50kg 이상'.
# NOTE(review): with right=False a value of exactly 100 falls outside the last
# bin — confirm the data stays below 100.
pd.cut(x=weight,
       bins=bins, labels=label, right=False) # len(labels) must equal the number of bins



# =============================================================================

# Summary statistics for two emp columns.
# NOTE(review): LAST_NAME is presumably a text column, in which case describe()
# reports count/unique/top/freq instead of numeric statistics — confirm.
emp['SALARY'].describe()
emp['LAST_NAME'].describe()

Series(weight).describe()

# Series.quantile takes a fraction between 0 and 1.
Series(weight).quantile(0)
Series(weight).quantile(.10) # 10th percentile
Series(weight).quantile(.25) # 25th percentile
Series(weight).quantile(.50)
Series(weight).quantile(.75)
Series(weight).quantile(.90)
Series(weight).quantile(1.00)

# np.percentile takes a percentage between 0 and 100; a list returns several at once.
np.percentile(weight, 0)
np.percentile(weight, 25)
np.percentile(weight, [0,25,50,75,100])
# Values at or above the 95th percentile (i.e. the top 5%)
weight[weight >= np.percentile(weight, 95)]
# Values at or below the 10th percentile (i.e. the bottom 10%)
weight[weight <= np.percentile(weight, 10)]


5. box plot
    #1) 데이터가 어떤 범위에 걸쳐 존재하는지 분포를 체크할 때 사용되는 그래프이다.
    #2) 장점ⓐ: 다섯가지 수치 요약을 제공하는 그래프 
    #3) 장점ⓑ: 이상치 데이터를 확인 할때 좋은 그래프

# Box plot of the weights: default, with a tick label, and drawn horizontally.
plt.boxplot(weight)
plt.boxplot(weight, labels=['몸무게']) # labels must be given as a list
plt.boxplot(weight, labels=['몸무게'], vert=False) # horizontal orientation


# 사분위수(Quartile)
    #1) 데이터 표본을 동일하게 4개로 나눈 값을 확인하는 방법(4등분)

# Five-number summary. The extremes are named q0 / q4 rather than min / max so
# the built-in min() and max() are not shadowed for the rest of the session.
q0 = np.percentile(weight, 0)    # minimum
q1 = np.percentile(weight, 25)   # first quartile
q2 = np.percentile(weight, 50)   # median
q3 = np.percentile(weight, 75)   # third quartile
q4 = np.percentile(weight, 100)  # maximum


# Interquartile range (IQR)
    #1) distance between the first and third quartiles

iqr = q3 - q1
iqr


# Lower and upper fences
# lower fence: values below it are outliers

lf = q1 - 1.5 * iqr # lower fence
lf


# upper fence: values above it are outliers

uf = q3 + 1.5 * iqr # upper fence
uf


# Outliers

weight[weight < lf]
weight[weight > uf]

# Whisker ends: the most extreme values that are still inside the fences
weight[weight >= lf].min() # lower whisker
weight[weight <= uf].max() # upper whisker



# Horizontal box plot annotated with its key values.
plt.boxplot(weight, labels=['몸무게'], vert=False)

# Label the first low outlier, if there is one. Indexing [0] unconditionally
# raises IndexError when no value lies below the lower fence.
low_outliers = weight[weight < lf]
if low_outliers.size:
    plt.text(low_outliers[0], 1.05, low_outliers[0], color='red')

# Quartiles in blue, just above the box.
for q in (q1, q2, q3):
    plt.text(q, 1.1, q, color='blue')

# Whisker ends: the most extreme values still inside the fences. Each is
# computed once and used for both the x position and the label.
lower_whisker = weight[weight >= lf].min()
upper_whisker = weight[weight <= uf].max()
plt.text(lower_whisker, 1.07, lower_whisker, color='red')
plt.text(upper_whisker, 1.05, upper_whisker, color='red')

# Load the height table; the '남자' (male) and '여자' (female) columns are used below.
height = pd.read_excel('c:/data/height.xlsx')
height.info()
height.describe()

# Box plots: each column on its own, then both side by side.
plt.boxplot(height['남자'])
plt.boxplot(height['여자'])
plt.boxplot([height['남자'],height['여자']], labels=['남자', '여자'])

# Histograms: each column on its own, then both in one call with a legend.
plt.hist(height['남자'])
plt.hist(height['여자'])
plt.hist([height['남자'],height['여자']], label=['남자', '여자'])
plt.legend()



6. 줄기 잎 그림(stem and leaf diagram)
    # 연속형자료의 특성을 나타내고자 할때 사용하는 그래프
# Anaconda Prompt 에서 설치
# (base) C:\Windows\System32>pip install stemgraphic    

import stemgraphic
# Stem-and-leaf plot for each height column.
stemgraphic.stem_graphic(height.남자)
stemgraphic.stem_graphic(height.여자)


7. 산점도(scatter plot)
    #1) 두 연속형 변수 사이의 관계(선형관계)를 보여주는 그래프 
    #2) 회귀분석 ex) 광고매출액-영업이익액

# Scatter plot of salary against department id.
plt.scatter(emp['DEPARTMENT_ID'], emp['SALARY']) # non-linear relationship

# Same plot with different marker shapes: 'v', 's', 'd'.
plt.scatter(emp['DEPARTMENT_ID'], 
            emp['SALARY'],
            marker='v')

plt.scatter(emp['DEPARTMENT_ID'], 
            emp['SALARY'],
            marker='s')

plt.scatter(emp['DEPARTMENT_ID'], 
            emp['SALARY'],
            marker='d')

# Marker shape 'd' combined with an explicit colour.
plt.scatter(emp['DEPARTMENT_ID'], 
            emp['SALARY'],
            marker='d', 
            color='red')


8. folium
    #1) 지도 시각화 라이브러리

import folium # 오류발생 -> 아나콘다에서 install 하기


# (base) C:\Windows\System32>pip install folium

import folium

latitude = 37.498952 # latitude
longitude = 127.031775 # longitude

# Map centred on the coordinates, with a red star marker and a popup, saved as HTML.
# NOTE(review): 'Stamen Terrain' may no longer be available as a built-in tile
# set in recent folium releases — confirm against the installed version.
m = folium.Map(location=[latitude, longitude], 
               zoom_start=15, 
               tiles = 'Stamen Terrain')
folium.Marker(location=[latitude, longitude],
              popup='itwill',
              icon=folium.Icon(color='red', icon='star')).add_to(m)
m.save('c:/data/seoul.html')

# Second map with the default tiles, zoomed in further, with a circle marker
# and a hover tooltip around the same point.
m = folium.Map(location=[latitude, longitude], zoom_start=18)
folium.CircleMarker(location=[latitude, longitude],
                    color='red',
                    radius=30, 
                    tooltip='학원주변').add_to(m)
m.save('c:/data/itwill_map.html')
728x90
반응형
LIST