Data Analysis/작업 유형 1 문제 풀이

11~20번 풀이 drop_duplicates('', keep='last'), reset_index(drop=True)

유방울 2023. 5. 29. 13:41

유튜브 공범컨텐츠 동영상 데이터

import pandas as pd


# Load the channel-info and video-info CSV logs straight from the remote repository.
channel = pd.read_csv(
    'https://raw.githubusercontent.com/Datamanim/datarepo/main/youtube/channelInfo.csv'
)
video = pd.read_csv(
    'https://raw.githubusercontent.com/Datamanim/datarepo/main/youtube/videoInfo.csv'
)

11

# Q11: count how many rows were recorded for each video title.
# The 'ct' column is converted to datetimes in place; later cells sort on it.
video['ct'] = pd.to_datetime(video['ct'])
title_column = video['videoname']
answer = title_column.value_counts()
print(answer)

 공범 EP1    3492
 공범 EP2    3204
 공범 EP3    2568
 공범 EP4    2280
 공범 EP5    1562
 공범 EP6    1274
 공범 EP7     555
 공범 EP8     266
Name: videoname, dtype: int64

12

# Q12 (step 1): order every row from the newest 'ct' to the oldest and
# show only the view count, title and timestamp columns.
shown_columns = ['viewcnt', 'videoname', 'ct']
video.sort_values(by='ct', ascending=False)[shown_columns]

viewcnt	videoname	ct
15200	1289088	공범 EP8	2021-11-01 15:30:05
13660	1630200	공범 EP7	2021-11-01 15:30:05
14934	1750222	공범 EP6	2021-11-01 15:30:04
13105	1503435	공범 EP5	2021-11-01 15:30:04
3491	3180532	공범 EP1	2021-11-01 15:30:03
...	...	...	...
14	36559	공범 EP1	2021-10-07 19:50:03
13	27451	공범 EP1	2021-10-07 19:40:03
12	19991	공범 EP1	2021-10-07 19:30:03
11	13087	공범 EP1	2021-10-07 19:20:02
10	6396	공범 EP1	2021-10-07 19:10:03
15201 rows × 3 columns

drop_duplicates(keep='last')

keep='last': 중복되는 행 가운데 마지막 행만 남기고 나머지는 제거함 (duplicated(keep='last') 기준으로는 마지막 행만 False, 즉 중복이 아닌 것으로 표시됨)

# Q12 (step 2): within each title, rows are ordered by 'ct', so keeping the
# last duplicate of 'videoname' leaves the row with the greatest 'ct' per video.
(
    video
    .sort_values(by=['videoname', 'ct'])
    .drop_duplicates(subset='videoname', keep='last')
    [['viewcnt', 'videoname', 'ct']]
)


viewcnt	videoname	ct
3491	3180532	공범 EP1	2021-11-01 15:30:03
6695	2199328	공범 EP2	2021-11-01 15:30:03
9263	1671294	공범 EP3	2021-11-01 15:30:03
11543	1818493	공범 EP4	2021-11-01 15:30:03
13105	1503435	공범 EP5	2021-11-01 15:30:04
14934	1750222	공범 EP6	2021-11-01 15:30:04
13660	1630200	공범 EP7	2021-11-01 15:30:05
15200	1289088	공범 EP8	2021-11-01 15:30:05

reset_index(drop=True)

drop=True: 기존 인덱스를 칼럼으로 보내지 않고 버린 뒤, 0부터 시작하는 새 인덱스를 부여함 (기본값인 drop=False 에서는 기존 인덱스가 칼럼으로 추가됨)

# Q12 (final): same per-video selection as before, then renumber the rows.
# reset_index(drop=True) throws the old index away instead of keeping it as a column.
newest_rows = video.sort_values(by=['videoname', 'ct']).drop_duplicates(
    subset='videoname', keep='last'
)
newest_rows[['viewcnt', 'videoname', 'ct']].reset_index(drop=True)

viewcnt	videoname	ct
0	3180532	공범 EP1	2021-11-01 15:30:03
1	2199328	공범 EP2	2021-11-01 15:30:03
2	1671294	공범 EP3	2021-11-01 15:30:03
3	1818493	공범 EP4	2021-11-01 15:30:03
4	1503435	공범 EP5	2021-11-01 15:30:04
5	1750222	공범 EP6	2021-11-01 15:30:04
6	1630200	공범 EP7	2021-11-01 15:30:05
7	1289088	공범 EP8	2021-11-01 15:30:05

13

# Q13 (step 1): parse 'ct' into datetimes in place, then show the rows whose
# timestamp is on or after 2021-10-03.
channel['ct'] = pd.to_datetime(channel['ct'])
on_or_after_cutoff = channel['ct'] >= pd.to_datetime('2021-10-03')
channel[on_or_after_cutoff]

	channelid	subcnt	viewcnt	videocnt	ct	channelname
12	UCkQCwnkQfgSuPTTnw_Y7v7w	1330000	414984078	738	2021-10-03 03:01:04	꽈뚜룹
13	UCkQCwnkQfgSuPTTnw_Y7v7w	1330000	417316443	738	2021-10-03 09:01:04	꽈뚜룹
14	UCkQCwnkQfgSuPTTnw_Y7v7w	1330000	417316443	738	2021-10-03 15:01:04	꽈뚜룹
15	UCkQCwnkQfgSuPTTnw_Y7v7w	1330000	417316443	738	2021-10-03 21:01:04	꽈뚜룹
16	UCkQCwnkQfgSuPTTnw_Y7v7w	1330000	417316443	738	2021-10-04 03:01:04	꽈뚜룹
...	...	...	...	...	...	...
1390	UCbBzKnCYcG7E3EIkSTdsawA	56700	7528614	57	2021-10-31 15:01:04	Balming Tiger
1391	UCbBzKnCYcG7E3EIkSTdsawA	56700	7528614	57	2021-10-31 21:01:04	Balming Tiger
1392	UCbBzKnCYcG7E3EIkSTdsawA	56800	7528614	57	2021-11-01 03:01:03	Balming Tiger
1393	UCbBzKnCYcG7E3EIkSTdsawA	56800	7539876	57	2021-11-01 09:01:04	Balming Tiger
1394	UCbBzKnCYcG7E3EIkSTdsawA	56800	7539876	57	2021-11-01 15:01:07	Balming Tiger
1309 rows × 6 columns
# Q13 (step 2): among rows on or after 2021-10-03, sort by time (then name) and
# keep the first occurrence of each channel name, i.e. its earliest record.
target = (
    channel[channel['ct'] >= pd.to_datetime('2021-10-03')]
    .sort_values(by=['ct', 'channelname'])
    .drop_duplicates(subset='channelname', keep='first')
)
target

	channelid	subcnt	viewcnt	videocnt	ct	channelname
393	UCDV9zgWo7b6nPg7i49oRQ5Q	922000	99187513	82	2021-10-03 03:01:02	논리왕 전기
139	UCEI4rb8YldV8v0dVUSo9ToQ	257000	28415226	73	2021-10-03 03:01:02	야전삽짱재
266	UCqwNCDzQBfw2seMRJpL-RWg	322000	71955394	212	2021-10-03 03:01:02	츄정ChuJeong
901	UCOYhhfLMGY8OjsbfGNVf7Lw	7520	1444890	15	2021-10-03 03:01:03	김농밀의 농밀한 삶
1028	UCKOeR5Evsupc77nPL3qCMnw	10100	188523	21	2021-10-03 03:01:03	릴펄 Lilpearl
520	UClu0udsIHkalM416ttWe_Zw	55000	4681137	107	2021-10-03 03:01:03	와글와글 WagleWagle
647	UCM9Rx3EqBJ-jMhD2E8Gjb4g	215000	15902641	60	2021-10-03 03:01:03	조나단
774	UCI9W73BWje69k7f1m_I4mbQ	14900	206221	49	2021-10-03 03:01:03	형사!탐정되다
1276	UCbBzKnCYcG7E3EIkSTdsawA	54300	7050078	54	2021-10-03 03:01:04	Balming Tiger
1155	UCs9H--14toIRdJQAD9qz9tA	471000	128772621	548	2021-10-03 03:01:04	곽토리 kwak tori
12	UCkQCwnkQfgSuPTTnw_Y7v7w	1330000	414984078	738	2021-10-03 03:01:04	꽈뚜룹
# Q13 (final): keep only the channel name and subscriber count columns and
# renumber the rows from 0, discarding the original index.
name_and_subs = target[['channelname', 'subcnt']]
answer = name_and_subs.reset_index(drop=True)
print(answer)

        channelname   subcnt
0            논리왕 전기   922000
1             야전삽짱재   257000
2        츄정ChuJeong   322000
3        김농밀의 농밀한 삶     7520
4       릴펄 Lilpearl    10100
5   와글와글 WagleWagle    55000
6               조나단   215000
7           형사!탐정되다    14900
8     Balming Tiger    54300
9     곽토리 kwak tori   471000
10              꽈뚜룹  1330000