영화 평점 데이터 분석¶

# pandas loads and reshapes the tables; numpy is used for array examples.
import pandas as pd
import numpy as np

데이터 읽어오기.¶

사용자 데이터: users
평점 데이터 : ratings
영화 데이터 : movies

# users.dat is the user-information file (user id, gender, ...).
# Fields are separated by '::'.
# pandas emitted a warning for this read, which is why engine='python'
# is set explicitly.
# Without header=None the first line would be taken as the column
# titles; here the first line has to be read as data.
pd.read_csv(
    'data/movielens/users.dat',
    header=None,
    engine='python',
    sep='::',
)

# Same read with explicit column titles. Once names= is supplied,
# header=None is applied automatically, so it is omitted.
pd.read_csv(
    'data/movielens/users.dat',
    names=['user_id', '성별', '연령', '직업', '지역'],
    engine='python',
    sep='::',
)

# Keep the parsed user table in `users` for the rest of the analysis.
users = pd.read_csv(
    'data/movielens/users.dat',
    names=['user_id', '성별', '연령', '직업', '지역'],
    engine='python',
    sep='::',
)

# Preview the first ten rows.
users.head(10)

# Load the ratings table. This read takes a while.
ratings = pd.read_csv(
    'data/movielens/ratings.dat',
    names=['user_id', 'movie_id', "평점", '타임스템프'],
    engine='python',
    sep='::',
)

# Number of rating rows.
ratings.shape[0]

1000209

# Preview the first five ratings.
ratings.head(5)

# Load the movie table (id, title, genre); the file is decoded as latin-1.
movies = pd.read_csv(
    'data/movielens/movies.dat',
    names=['movie_id', '제목', '장르'],
    encoding='latin-1',
    engine='python',
    sep='::',
)

# Preview the first five movies.
movies.head(5)

데이터 병합¶

users, ratings, movies를 DB에서 join하듯이 합치기.

# Report the row count of each table, formatted with thousands separators.
print(
    "사용자:{0:,}, 영화:{1:,}, 평점:{2:,}".format(
        *(len(table) for table in (users, movies, ratings))
    )
)

사용자:6,040, 영화:3,883, 평점:1,000,209

# 3개의 users, ratings, movies를 통합해서 분석.
# users 약 6,000건, users-ratings 연결. ratings-movies 연결.
# movies 약 4,000건
# ratings 약 100만건.

1. 평점과 유저를 병합..... : key --> user_id¶

merge 함수 사용: 원본 데이터는 변경되지 않으며, 합쳐진 새 데이터가 반환된다.
- 첫번째 인자가 기준.
- 두번째 인자는 합쳐지는 데이터.
- 합칠때 키를 on= 옵션으로 셋팅.

# Join every rating with the attributes of the user who gave it,
# matching rows on user_id. The result is a new table.
data = ratings.merge(users, on='user_id')
data.head(10)

2. 유저-평점 과 영화를 병합....¶

# Add the movie title and genre to each row, matching on movie_id.
data = data.merge(movies, on='movie_id')
data.head(5)

# Both joins as a single expression: ratings -> users -> movies.
data = ratings.merge(users, on='user_id').merge(movies, on='movie_id')
data.head(3)

# Row count of the fully joined table.
data.shape[0]

1000209

3. 영화별 성별 평균 평점.¶

결과표¶

영화/성별 - 평점평균 --> 피봇팅.
제목 / 성별 F M
.... *영화제목 30 40
....

# Pivot the joined table: one row per title (제목), one column per
# gender (성별), each cell holding the mean rating (평점).
평점기준 = data.pivot_table(
    values='평점',
    index='제목',
    columns='성별',
    aggfunc='mean',
)
평점기준.head(10)

영화의 평점 건수가 250건 이상인것만.... 분석.¶

# Goal: keep only movies with at least 250 ratings.
# First group the joined rows by movie title.
영화별 = data.groupby(by='제목')

# size() gives the number of rows (ratings) in each title group.
평점건수 = 영화별.size()

평점건수.head(10)

제목
$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64

# Boolean mask indexed by title: True where the movie has 250+ ratings.
평점기준 = 평점건수.ge(250)
평점기준.head(10)

제목
$1,000,000 Duck (1971)               False
'Night Mother (1986)                 False
'Til There Was You (1997)            False
'burbs, The (1989)                    True
...And Justice for All (1979)        False
1-900 (1994)                         False
10 Things I Hate About You (1999)     True
101 Dalmatians (1961)                 True
101 Dalmatians (1996)                 True
12 Angry Men (1957)                   True
dtype: bool

# Mean rating per title and gender, as a title x gender table.
평점평균 = data.pivot_table(
    values='평점',
    index='제목',
    columns='성별',
    aggfunc='mean',
)
# Apply the boolean mask so only titles with 250+ ratings remain.
평점평균 = 평점평균.loc[평점기준]

평점평균.head(10)

# Ten movies rated highest by women.
평점평균.sort_values(by='F', ascending=False).head(10)

# Ten movies rated highest by men.
평점평균.sort_values(by='M', ascending=False).head(10)

# Movies that split the genders: a large gap between the two means.
# The gap is quantified as (male mean - female mean).
평점평균['M'].sub(평점평균['F'])

제목
'burbs, The (1989)                                                    0.168607
10 Things I Hate About You (1999)                                    -0.334586
101 Dalmatians (1961)                                                -0.291444
101 Dalmatians (1996)                                                -0.328785
12 Angry Men (1957)                                                   0.144024
13th Warrior, The (1999)                                              0.056000
2 Days in the Valley (1996)                                          -0.244076
20,000 Leagues Under the Sea (1954)                                   0.039102
2001: A Space Odyssey (1968)                                          0.304156
2010 (1984)                                                          -0.033097
28 Days (2000)                                                       -0.231717
39 Steps, The (1935)                                                  0.142175
54 (1998)                                                             0.080424
7th Voyage of Sinbad, The (1958)                                      0.249788
8MM (1999)                                                           -0.055288
About Last Night... (1986)                                           -0.047770
Absent Minded Professor, The (1961)                                  -0.022579
Absolute Power (1997)                                                -0.141377
Abyss, The (1989)                                                     0.030272
Ace Ventura: Pet Detective (1994)                                     0.197917
Ace Ventura: When Nature Calls (1995)                                 0.273670
Addams Family Values (1993)                                          -0.121469
Addams Family, The (1991)                                            -0.022672
Adventures in Babysitting (1987)                                     -0.247660
Adventures of Buckaroo Bonzai Across the 8th Dimension, The (1984)    0.093810
Adventures of Priscilla, Queen of the Desert, The (1994)             -0.300260
Adventures of Robin Hood, The (1938)                                 -0.248299
African Queen, The (1951)                                            -0.100410
Age of Innocence, The (1993)                                         -0.487561
Agnes of God (1985)                                                  -0.289986
                                                                        ...   
White Men Can't Jump (1992)                                           0.202284
Who Framed Roger Rabbit? (1988)                                       0.143873
Who's Afraid of Virginia Woolf? (1966)                                0.067236
Whole Nine Yards, The (2000)                                          0.108262
Wild Bunch, The (1969)                                                0.491736
Wild Things (1998)                                                    0.067082
Wild Wild West (1999)                                                -0.143476
William Shakespeare's Romeo and Juliet (1996)                        -0.213965
Willow (1988)                                                        -0.205139
Willy Wonka and the Chocolate Factory (1971)                         -0.274480
Witness (1985)                                                       -0.174349
Wizard of Oz, The (1939)                                             -0.151892
Wolf (1994)                                                          -0.174992
Women on the Verge of a Nervous Breakdown (1988)                     -0.068566
Wonder Boys (2000)                                                   -0.130147
Working Girl (1988)                                                  -0.294242
World Is Not Enough, The (1999)                                       0.051389
Wrong Trousers, The (1993)                                           -0.109974
Wyatt Earp (1994)                                                     0.136839
X-Files: Fight the Future, The (1998)                                 0.004323
X-Men (2000)                                                          0.169391
Year of Living Dangerously (1982)                                    -0.081817
Yellow Submarine (1968)                                              -0.025000
You've Got Mail (1998)                                               -0.266834
Young Frankenstein (1974)                                            -0.050785
Young Guns (1988)                                                     0.053825
Young Guns II (1990)                                                 -0.030758
Young Sherlock Holmes (1985)                                         -0.151362
Zero Effect (1998)                                                   -0.141266
eXistenZ (1999)                                                       0.190494
Length: 1216, dtype: float64

# Store the male-minus-female gap as a new column named '차이'.
평점평균['차이'] = 평점평균['M'].sub(평점평균['F'])

평점평균.head(10)

# Movies men liked more than women: largest gap first.
평점평균.sort_values(by='차이', ascending=False).head(10)

# Movies women liked more than men: smallest (most negative) gap first.
평점평균.sort_values(by='차이').head(10)

# Largest disagreement regardless of direction: rank by absolute gap.
평점평균['차이'].abs().sort_values(ascending=False).head(10)

제목
Dirty Dancing (1987)                      0.830782
Good, The Bad and The Ugly, The (1966)    0.726351
Kentucky Fried Movie, The (1977)          0.676359
Jumpin' Jack Flash (1986)                 0.676359
Dumb & Dumber (1994)                      0.638608
Longest Day, The (1962)                   0.619682
Cable Guy, The (1996)                     0.613787
Evil Dead II (Dead By Dawn) (1987)        0.611985
Grease (1978)                             0.608224
Hidden, The (1987)                        0.607167
Name: 차이, dtype: float64

성별 무관하게 호불호가 갈리는 영화¶

표준편차를 이용한다.

# Regroup the joined rows by title.
영화별 = data.groupby(by="제목")

# Standard deviation of the ratings within each title.
평점편차 = 영화별['평점'].std()

# Drop titles with fewer than 250 ratings, reusing the earlier boolean mask.
평점편차 = 평점편차.loc[평점기준]

# Ten titles whose ratings are most spread out.
평점편차.sort_values(ascending=False).head(10)

제목
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Eyes Wide Shut (1999)                    1.259624
Evita (1996)                             1.253631
Billy Madison (1995)                     1.249970
Fear and Loathing in Las Vegas (1998)    1.246408
Bicentennial Man (1999)                  1.245533
Name: 평점, dtype: float64

불리언 색인¶

# Boolean filtering on an ndarray: build the sample names directly
# as a NumPy array.
names = np.array(['이성주', '김성주', '홍길동', '임꺽정', '김희선', '강호동'])
names

array(['이성주', '김성주', '홍길동', '임꺽정', '김희선', '강호동'], dtype='<U3')

# Element-wise comparison: the result is a boolean mask that is True
# where the element equals '김성주'.
필터 = names == '김성주'

# Indexing with the mask keeps only the elements where it is True.
names[필터]

array(['김성주'], dtype='<U3')

# The comparison can also be written inline as the index expression.
names[names == '홍길동']

array(['홍길동'], dtype='<U3')

# A pandas Series supports the same boolean filtering; wrap the names in one.
names = pd.Series(data=names)
names

0    이성주
1    김성주
2    홍길동
3    임꺽정
4    김희선
5    강호동
dtype: object

# Keep only the entries equal to '김성주'.
names[names.eq('김성주')]

1    김성주
dtype: object

# Boolean mask that is True for every entry other than '김성주'.
names.ne('김성주')

0     True
1    False
2     True
3     True
4     True
5     True
dtype: bool

# The Python keywords `and` / `or` cannot combine boolean arrays: they
# ask for the truth value of a whole Series, which pandas refuses as
# ambiguous (ValueError).
# Use the element-wise operators instead: and -> &, or -> |
names.eq('김성주') | names.eq('홍길동')

0    False
1     True
2     True
3    False
4    False
5    False
dtype: bool

# Select the entries matching either name using the combined mask.
names[names.eq('이성주') | names.eq('김성주')]

0    이성주
1    김성주
dtype: object

# The same selection written by hand with a for statement, for comparison
# with the boolean-mask version.
choice = []
for nm in names:
    if nm in ('이성주', '김성주'):
        choice.append(nm)
choice

['이성주', '김성주']

연습¶

대표장르의 도수 측정
대표장르별 성별 평점 개수. "성별에 따라 선호하는 장르가 다른가?"
대표장르별 성별 평점 평균
대표장르별 성별 평점 편차.

	user_id	movie_id	평점	타임스템프
0	1	1193	5	978300760
1	1	661	3	978302109
2	1	914	3	978301968
3	1	3408	4	978300275
4	1	2355	5	978824291

	movie_id	제목	장르
0	1	Toy Story (1995)	Animation|Children's|Comedy
1	2	Jumanji (1995)	Adventure|Children's|Fantasy
2	3	Grumpier Old Men (1995)	Comedy|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama
4	5	Father of the Bride Part II (1995)	Comedy

	user_id	movie_id	평점	타임스템프	성별	연령	직업	지역
0	1	1193	5	978300760	F	1	10	48067
1	1	661	3	978302109	F	1	10	48067
2	1	914	3	978301968	F	1	10	48067
3	1	3408	4	978300275	F	1	10	48067
4	1	2355	5	978824291	F	1	10	48067
5	1	1197	3	978302268	F	1	10	48067
6	1	1287	5	978302039	F	1	10	48067
7	1	2804	5	978300719	F	1	10	48067
8	1	594	4	978302268	F	1	10	48067
9	1	919	4	978301368	F	1	10	48067

	user_id	movie_id	평점	타임스템프	성별	연령	직업	지역	제목	장르
0	1	1193	5	978300760	F	1	10	48067	One Flew Over the Cuckoo's Nest (1975)	Drama
1	2	1193	5	978298413	M	56	16	70072	One Flew Over the Cuckoo's Nest (1975)	Drama
2	12	1193	4	978220179	M	25	12	32793	One Flew Over the Cuckoo's Nest (1975)	Drama
3	15	1193	4	978199279	M	25	7	22903	One Flew Over the Cuckoo's Nest (1975)	Drama
4	17	1193	5	978158471	M	50	1	95350	One Flew Over the Cuckoo's Nest (1975)	Drama

성별	F	M
제목
$1,000,000 Duck (1971)	3.375000	2.761905
'Night Mother (1986)	3.388889	3.352941
'Til There Was You (1997)	2.675676	2.733333
'burbs, The (1989)	2.793478	2.962085
...And Justice for All (1979)	3.828571	3.689024
1-900 (1994)	2.000000	3.000000
10 Things I Hate About You (1999)	3.646552	3.311966
101 Dalmatians (1961)	3.791444	3.500000
101 Dalmatians (1996)	3.240000	2.911215
12 Angry Men (1957)	4.184397	4.328421

내 마음대로 공간

파이썬(Python) - 분석 실습 영화평점분석

영화 평점 데이터 분석¶

데이터 읽어오기.¶

데이터 병합¶

1. 평점과 유저를 병합..... : key --> user_id¶

2. 유저-평점 과 영화를 병합....¶

3. 영화별 성별 평균 평점.¶

결과표¶

영화의 평점 건수가 250건 이상인것만.... 분석.¶

성별 무관하게 호불호가 갈리는 영화¶

불리언 색인¶

연습¶

'프로그래밍 > Python' 카테고리의 다른 글

공유하기

'프로그래밍/Python' 의 관련글

티스토리툴바

	0	1	2	3	4
0	1	F	1	10	48067
1	2	M	56	16	70072
2	3	M	25	15	55117
3	4	M	45	7	02460
4	5	M	25	20	55455
5	6	F	50	9	55117
6	7	M	35	1	06810
7	8	M	25	12	11413
8	9	M	25	17	61614
9	10	F	35	1	95370
10	11	F	25	1	04093
11	12	M	25	12	32793
12	13	M	45	1	93304
13	14	M	35	0	60126
14	15	M	25	7	22903
15	16	F	35	0	20670
16	17	M	50	1	95350
17	18	F	18	3	95825
18	19	M	1	10	48073
19	20	M	25	14	55113
20	21	M	18	16	99353
21	22	M	18	15	53706
22	23	M	35	0	90049
23	24	F	25	7	10023
24	25	M	18	4	01609
25	26	M	25	7	23112
26	27	M	25	11	19130
27	28	F	25	1	14607
28	29	M	35	7	33407
29	30	F	35	7	19143
...	...	...	...	...	...
6010	6011	M	35	15	80538
6011	6012	M	35	15	02871
6012	6013	F	25	20	32301
6013	6014	M	45	1	80634
6014	6015	F	25	9	80013
6015	6016	M	45	1	37209
6016	6017	F	35	7	21117
6017	6018	M	35	1	48906
6018	6019	M	25	0	10024

	user_id	성별	연령	직업	지역
0	1	F	1	10	48067
1	2	M	56	16	70072
2	3	M	25	15	55117
3	4	M	45	7	02460
4	5	M	25	20	55455
5	6	F	50	9	55117
6	7	M	35	1	06810
7	8	M	25	12	11413
8	9	M	25	17	61614
9	10	F	35	1	95370
10	11	F	25	1	04093
11	12	M	25	12	32793
12	13	M	45	1	93304
13	14	M	35	0	60126
14	15	M	25	7	22903
15	16	F	35	0	20670
16	17	M	50	1	95350
17	18	F	18	3	95825
18	19	M	1	10	48073
19	20	M	25	14	55113
20	21	M	18	16	99353
21	22	M	18	15	53706
22	23	M	35	0	90049
23	24	F	25	7	10023
24	25	M	18	4	01609
25	26	M	25	7	23112
26	27	M	25	11	19130
27	28	F	25	1	14607
28	29	M	35	7	33407
29	30	F	35	7	19143
...	...	...	...	...	...
6010	6011	M	35	15	80538
6011	6012	M	35	15	02871
6012	6013	F	25	20	32301
6013	6014	M	45	1	80634
6014	6015	F	25	9	80013
6015	6016	M	45	1	37209
6016	6017	F	35	7	21117
6017	6018	M	35	1	48906
6018	6019	M	25	0	10024

	user_id	성별	연령	직업	지역
0	1	F	1	10	48067
1	2	M	56	16	70072
2	3	M	25	15	55117
3	4	M	45	7	02460
4	5	M	25	20	55455
5	6	F	50	9	55117
6	7	M	35	1	06810
7	8	M	25	12	11413
8	9	M	25	17	61614
9	10	F	35	1	95370

성별	F	M
제목
Close Shave, A (1995)	4.644444	4.473795
Wrong Trousers, The (1993)	4.588235	4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)	4.572650	4.464589
Wallace & Gromit: The Best of Aardman Animation (1996)	4.563107	4.385075
Schindler's List (1993)	4.562602	4.491415
Shawshank Redemption, The (1994)	4.539075	4.560625
Grand Day Out, A (1992)	4.537879	4.293255
To Kill a Mockingbird (1962)	4.536667	4.372611
Creature Comforts (1990)	4.513889	4.272277
Usual Suspects, The (1995)	4.513317	4.518248