[Pandas]데이터 구조, 분석

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import pandas as pd
 
### Inspecting the data ###
# Preview the data #
# The column names are assigned by hand further down, i.e. the CSV is treated
# as having no header row.  With pandas' default header inference the first
# record would be consumed as the column labels and silently dropped from the
# data, so the file is read with header=None to keep every record.
data = pd.read_csv(
    "C:/Users/ZenBook/Desktop/code/sample/part3/auto-mpg.csv",
    header=None,
)
print(data.head())  # head(n) shows only the first n rows (n defaults to 5).

# Name the columns to match the data.  The label 'origion' (sic) is kept
# as-is because the rest of the script indexes the column by this exact name.
data.columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'model year', 'origion', 'name']

print(data.head(5))
# NOTE(review): the sample output below was recorded before header=None was
# added, when the first record of the file was being used up as the header.
# After the fix one additional record precedes these rows and the index
# labels shift by one accordingly.
'''
    mpg  cylinders  displacement horsepower  weight  acceleration  model year  origion                name
0  15.0          8         350.0      165.0  3693.0          11.5          70        1   buick skylark 320
1  18.0          8         318.0      150.0  3436.0          11.0          70        1  plymouth satellite
2  16.0          8         304.0      150.0  3433.0          12.0          70        1       amc rebel sst
'''
 
# .shape is the (row count, column count) tuple of the DataFrame.
# The recorded run printed (397, 9).
print(data.shape)

# .info() writes basic information about the DataFrame to stdout by itself and
# returns None, so it is called directly; wrapping it in print() only appended
# a stray "None" line after the summary.
data.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   mpg           397 non-null    float64
 1   cylinders     397 non-null    int64
 2   displacement  397 non-null    float64
 3   horsepower    397 non-null    object
 4   weight        397 non-null    float64
 5   acceleration  397 non-null    float64
 6   model year    397 non-null    int64
 7   origion       397 non-null    int64
 8   name          397 non-null    object
dtypes: float64(4), int64(3), object(2)
memory usage: 28.0+ KB
'''
 
# .describe() builds a table of summary statistics (count, mean, std, min,
# quartiles, max).  By default only numeric columns are covered; passing
# include='all' also reports on the columns that hold string data.
summary = data.describe()
print(summary)
'''
              mpg   cylinders  displacement       weight  acceleration  model year     origion
count  397.000000  397.000000    397.000000   397.000000    397.000000  397.000000  397.000000
mean    23.528463    5.448363    193.139798  2969.080605     15.577078   76.025189    1.574307
std      7.820926    1.698329    104.244898   847.485218      2.755326    3.689922    0.802549
min      9.000000    3.000000     68.000000  1613.000000      8.000000   70.000000    1.000000
25%     17.500000    4.000000    104.000000  2223.000000     13.900000   73.000000    1.000000
50%     23.000000    4.000000    146.000000  2800.000000     15.500000   76.000000    1.000000
75%     29.000000    8.000000    262.000000  3609.000000     17.200000   79.000000    2.000000
max     46.600000    8.000000    455.000000  5140.000000     24.800000   82.000000    3.000000
'''
 
 
# Counting the data #
# Uses the count() and value_counts() methods #

# count() reports, per column, how many non-null entries it holds.
non_null_counts = data.count()
print(non_null_counts)
'''
mpg             397
cylinders       397
displacement    397
horsepower      397
weight          397
acceleration    397
model year      397
origion         397
name            397
dtype: int64
'''

# value_counts() tallies how many times each distinct value occurs in the
# selected column.
origin_tally = data['origion'].value_counts()
print(origin_tally)
 
 
# Statistical functions #
# Mean #
# Uses the mean() method #
# The frame contains object (string) columns ('horsepower', 'name' in the
# recorded info() output).  numeric_only=True restricts the mean to the
# numeric columns explicitly: older pandas versions only warned about the
# non-numeric columns and dropped them, while pandas 2.0+ raises a TypeError
# when the argument is omitted.
print(data.mean(numeric_only=True))
# The recorded output originally began with a stray "print(data.mean())" line,
# presumably the tail of that deprecation warning; it is left out here.
'''
mpg               23.528463
cylinders          5.448363
displacement     193.139798
weight          2969.080605
acceleration      15.577078
model year        76.025189
origion            1.574307
dtype: float64
'''

# Mean of the 'weight' column only; the recorded run printed 2969.080604534005.
print(data['weight'].mean())
 
# Median #
# Uses the median() method #
mpg_median = data['mpg'].median()
print(mpg_median)  # median of the 'mpg' column; 23.0 in the recorded run

# Maximum #
# Uses the max() method #
# NOTE(review): the original remark that string data is ordered by its
# character codes was attached to median(); it describes how max()/min()
# compare text values, so it is kept here instead.
cylinders_max = data['cylinders'].max()
print(cylinders_max)  # largest value of the 'cylinders' column; 8 in the recorded run

# Minimum #
# Uses the min() method #
weight_min = data['weight'].min()
print(weight_min)  # smallest value of the 'weight' column; 1613.0 in the recorded run

# Standard deviation #
# Uses the std() method #
# String data is not included when the standard deviation is computed #
weight_std = data['weight'].std()
print(weight_std)  # std of the 'weight' column; 847.4852184591211 in the recorded run
 
# Correlation coefficient #
# Uses the corr() method #
# String data is not included when the correlation is computed #
# Select the two numeric columns first, then compute their pairwise
# correlation matrix.
mpg_and_weight = data[['mpg', 'weight']]
print(mpg_and_weight.corr())
'''
             mpg    weight
mpg     1.000000 -0.831558
weight -0.831558  1.000000
'''
Colored by Color Scripter
cs
'Pandas' 카테고리의 다른 글

Matplotlib 면적 그래프, 막대 그래프 그리기 (0)	2022.11.05
Matplotlib 그래프 그리기 기초, 선 그래프 (1)	2022.11.04
[Pandas]데이터 저장하기 (0)	2022.11.03
[Pandas]웹에서 데이터 가져오기 (0)	2022.11.03
[Pandas]외부 파일 불러오기 (0)	2022.11.02
'Pandas' 카테고리의 다른 글

티스토리툴바