728x90
데이터 파싱하기
사용할 데이터는 누적된 수치를 나타내는 데이터입니다.
우선 XML로 응답받은 데이터를 파일로 저장한 뒤, 그 파일을 읽어서 JSON으로 변환해 보겠습니다.
이 API는 일일 트래픽이 10,000건으로 제한되어 있습니다.
10000건의 트래픽은 적지 않지만, 테스트할 때는 응답받은 데이터를 파일로 저장하는 것도 하나의 방법입니다.
# Tip
xml.dom.minidom : toprettyxml() 을 사용하면 XML 을 들여쓰기가 적용된 형태로 출력할 수 있습니다.
'''
보건복지부_코로나19 연령별·성별감염_현황
'''
import requests
from urllib import parse
from bs4 import BeautifulSoup as bs
import datetime
import json
import xml.dom.minidom
url = 'http://openapi.data.go.kr/openapi/service/rest/Covid19/getCovid19GenAgeCaseInfJson'
# NOTE(review): the service key is embedded in the source in its already
# percent-encoded form. Prefer loading it from an environment variable or a
# config file before sharing or committing this script.
key = 'gIQmqloKO%2B9e8xRB3iSW6t9uW%2FKem82g4ELNU%2F3dzkhOfcf5%2BnHjdamtop%2FCRAWlyyTlhk7W1F0Jm1GX9w%3D%3D'


def build_query_params(service_key, create_dt, page_no=1, num_of_rows=10):
    """Return the query string (including the leading '?') for one request.

    Args:
        service_key: API key, already percent-encoded.
        create_dt: Date in ``YYYYMMDD`` form, used as both the start and the
            end of the requested creation-date range.
        page_no: Page number to request.
        num_of_rows: Number of rows per page.

    Returns:
        The query string to append to the service URL.
    """
    # The key is spliced in verbatim: it is already percent-encoded, and
    # passing it through urlencode would encode its '%' signs a second time.
    # The remaining keys are plain ASCII, so urlencode alone is sufficient.
    return f'?ServiceKey={service_key}&' + parse.urlencode({
        'pageNo': page_no,
        'numOfRows': num_of_rows,
        'startCreateDt': create_dt,
        'endCreateDt': create_dt,
    })


def fetch_pretty_xml(request_url, timeout=10):
    """Download *request_url* and return the response as indented XML text.

    Args:
        request_url: Full URL including the query string.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response document re-serialised by ``toprettyxml()``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    res = requests.get(request_url, timeout=timeout)
    # Fail here with a clear HTTP error instead of a confusing XML parse error.
    res.raise_for_status()
    # Pass the raw bytes so the XML parser applies the encoding declared in
    # the document itself rather than the charset requests guessed.
    document = xml.dom.minidom.parseString(res.content)
    return document.toprettyxml()


def main():
    """Fetch today's data and save it to ``result.xml`` as indented XML."""
    today = datetime.datetime.now().strftime('%Y%m%d')
    pretty_xml = fetch_pretty_xml(url + build_query_params(key, today))
    with open('result.xml', 'w', encoding='utf-8') as f:
        f.write(pretty_xml)


if __name__ == '__main__':
    main()
더보기
<?xml version="1.0" ?>
<response>
<header>
<resultCode>00</resultCode>
<resultMsg>NORMAL SERVICE.</resultMsg>
</header>
<body>
<items>
<item>
<confCase>222</confCase>
<confCaseRate>1.64</confCaseRate>
<createDt>2020-07-15 10:40:12.153</createDt>
<criticalRate>0</criticalRate>
<death>0</death>
<deathRate>0.00</deathRate>
<gubun>0-9</gubun>
<seq>2026</seq>
<updateDt>null</updateDt>
</item>
<item>
<confCase>753</confCase>
<confCaseRate>5.56</confCaseRate>
<createDt>2020-07-15 10:40:12.153</createDt>
<criticalRate>0</criticalRate>
<death>0</death>
<deathRate>0.00</deathRate>
<gubun>10-19</gubun>
<seq>2025</seq>
<updateDt>null</updateDt>
</item>
<item>
<confCase>3495</confCase>
<confCaseRate>25.79</confCaseRate>
<createDt>2020-07-15 10:40:12.153</createDt>
<criticalRate>0</criticalRate>
<death>0</death>
<deathRate>0.00</deathRate>
<gubun>20-29</gubun>
<seq>2024</seq>
<updateDt>null</updateDt>
</item>
<item>
<confCase>1640</confCase>
<confCaseRate>12.1</confCaseRate>
<createDt>2020-07-15 10:40:12.152</createDt>
<criticalRate>0.12</criticalRate>
<death>2</death>
<deathRate>0.69</deathRate>
<gubun>30-39</gubun>
<seq>2023</seq>
<updateDt>null</updateDt>
</item>
<item>
<confCase>1782</confCase>
<confCaseRate>13.15</confCaseRate>
<createDt>2020-07-15 10:40:12.152</createDt>
<criticalRate>0.17</criticalRate>
<death>3</death>
<deathRate>1.04</deathRate>
<gubun>40-49</gubun>
<seq>2022</seq>
<updateDt>null</updateDt>
</item>
<item>
<confCase>2411</confCase>
<confCaseRate>17.79</confCaseRate>
<createDt>2020-07-15 10:40:12.152</createDt>
<criticalRate>0.62</criticalRate>
<death>15</death>
<deathRate>5.19</deathRate>
<gubun>50-59</gubun>
<seq>2021</seq>
<updateDt>null</updateDt>
</item>
<item>
<confCase>1769</confCase>
<confCaseRate>13.05</confCaseRate>
<createDt>2020-07-15 10:40:12.152</createDt>
<criticalRate>2.32</criticalRate>
<death>41</death>
<deathRate>14.19</deathRate>
<gubun>60-69</gubun>
<seq>2020</seq>
<updateDt>null</updateDt>
</item>
<item>
<confCase>902</confCase>
<confCaseRate>6.66</confCaseRate>
<createDt>2020-07-15 10:40:12.152</createDt>
<criticalRate>9.31</criticalRate>
<death>84</death>
<deathRate>29.07</deathRate>
<gubun>70-79</gubun>
<seq>2019</seq>
<updateDt>null</updateDt>
</item>
<item>
<confCase>577</confCase>
<confCaseRate>4.26</confCaseRate>
<createDt>2020-07-15 10:40:12.152</createDt>
<criticalRate>24.96</criticalRate>
<death>144</death>
<deathRate>49.83</deathRate>
<gubun>80 이상</gubun>
<seq>2018</seq>
<updateDt>null</updateDt>
</item>
<item>
<confCase>7607</confCase>
<confCaseRate>56.14</confCaseRate>
<createDt>2020-07-15 10:40:12.152</createDt>
<criticalRate>1.79</criticalRate>
<death>136</death>
<deathRate>47.06</deathRate>
<gubun>여성</gubun>
<seq>2017</seq>
<updateDt>null</updateDt>
</item>
<item>
<confCase>5944</confCase>
<confCaseRate>43.86</confCaseRate>
<createDt>2020-07-15 10:40:12.152</createDt>
<criticalRate>2.57</criticalRate>
<death>153</death>
<deathRate>52.94</deathRate>
<gubun>남성</gubun>
<seq>2016</seq>
<updateDt>null</updateDt>
</item>
</items>
<numOfRows>10</numOfRows>
<pageNo>1</pageNo>
<totalCount>11</totalCount>
</body>
</response>
데이터 구조를 변형하여 XML 을 JSON 으로 변환해보겠습니다.
BeautifulSoup 모듈을 이용하면 tag에 접근하는 것이 간단합니다. 다만 'html.parser' 는 태그 이름을 소문자로 바꾸기 때문에, confCase 태그는 confcase 로 접근해야 합니다.
기본적인 형태.
# Tip
json.dump
indent='\t' : 간격을 지정합니다. (\t 은 탭을 의미)
ensure_ascii=False : 한글 등 ASCII 가 아닌 문자를 그대로 저장합니다. (True 이면, 한글이 \uXXXX 형태의 이스케이프 문자열로 저장됩니다)
'''
보건복지부_코로나19 연령별·성별감염_현황
'''
import bs4
from bs4 import BeautifulSoup as bs
import json
import xml.dom.minidom
# Child tags copied out of every item, in the order they appear in the output.
item_fields = (
    'confcase',
    'confcaserate',
    'createdt',
    'criticalrate',
    'death',
    'deathrate',
    'gubun',
    'seq',
    'updatedt',
)

# Load the XML saved earlier and parse it with the 'html.parser' backend.
with open('result.xml', 'r', encoding='utf-8') as source:
    soup = bs(source.read(), 'html.parser')

# Only Tag children of <items> are converted; any other node type is skipped.
json_data = [
    {field: node.find(field).string for field in item_fields}
    for node in soup.items
    if isinstance(node, bs4.element.Tag)
]

# Tab-indented output; non-ASCII text is written as-is.
with open('result.json', 'w', encoding='utf-8') as target:
    json.dump(json_data, target, indent='\t', ensure_ascii=False)
더보기
[
{
"confcase": "222",
"confcaserate": "1.64",
"createdt": "2020-07-15 10:40:12.153",
"criticalrate": "0",
"death": "0",
"deathrate": "0.00",
"gubun": "0-9",
"seq": "2026",
"updatedt": "null"
},
{
"confcase": "753",
"confcaserate": "5.56",
"createdt": "2020-07-15 10:40:12.153",
"criticalrate": "0",
"death": "0",
"deathrate": "0.00",
"gubun": "10-19",
"seq": "2025",
"updatedt": "null"
},
{
"confcase": "3495",
"confcaserate": "25.79",
"createdt": "2020-07-15 10:40:12.153",
"criticalrate": "0",
"death": "0",
"deathrate": "0.00",
"gubun": "20-29",
"seq": "2024",
"updatedt": "null"
},
{
"confcase": "1640",
"confcaserate": "12.1",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "0.12",
"death": "2",
"deathrate": "0.69",
"gubun": "30-39",
"seq": "2023",
"updatedt": "null"
},
{
"confcase": "1782",
"confcaserate": "13.15",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "0.17",
"death": "3",
"deathrate": "1.04",
"gubun": "40-49",
"seq": "2022",
"updatedt": "null"
},
{
"confcase": "2411",
"confcaserate": "17.79",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "0.62",
"death": "15",
"deathrate": "5.19",
"gubun": "50-59",
"seq": "2021",
"updatedt": "null"
},
{
"confcase": "1769",
"confcaserate": "13.05",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "2.32",
"death": "41",
"deathrate": "14.19",
"gubun": "60-69",
"seq": "2020",
"updatedt": "null"
},
{
"confcase": "902",
"confcaserate": "6.66",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "9.31",
"death": "84",
"deathrate": "29.07",
"gubun": "70-79",
"seq": "2019",
"updatedt": "null"
},
{
"confcase": "577",
"confcaserate": "4.26",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "24.96",
"death": "144",
"deathrate": "49.83",
"gubun": "80 이상",
"seq": "2018",
"updatedt": "null"
},
{
"confcase": "7607",
"confcaserate": "56.14",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "1.79",
"death": "136",
"deathrate": "47.06",
"gubun": "여성",
"seq": "2017",
"updatedt": "null"
},
{
"confcase": "5944",
"confcaserate": "43.86",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "2.57",
"death": "153",
"deathrate": "52.94",
"gubun": "남성",
"seq": "2016",
"updatedt": "null"
}
]
gubun 값을 Key 로 밖으로 빼서 각 데이터 묶음을 식별하는 게 좋아 보입니다.
'''
보건복지부_코로나19 연령별·성별감염_현황
'''
import bs4
from bs4 import BeautifulSoup as bs
import json
import xml.dom.minidom
# Child tags stored under each gubun key, in the order they appear in the output.
stat_fields = (
    'confcase',
    'confcaserate',
    'createdt',
    'criticalrate',
    'death',
    'deathrate',
    'seq',
    'updatedt',
)

# Load the XML saved earlier and parse it with the 'html.parser' backend.
with open('result.xml', 'r', encoding='utf-8') as source:
    soup = bs(source.read(), 'html.parser')

json_data = {}
for node in soup.items:
    # Only Tag children of <items> are converted; other node types are skipped.
    if not isinstance(node, bs4.element.Tag):
        continue
    group = node.find('gubun').string
    stats = {field: node.find(field).string for field in stat_fields}
    # setdefault keeps the first entry seen for a given gubun value.
    json_data.setdefault(group, stats)

# Tab-indented output; non-ASCII text is written as-is.
with open('result2.json', 'w', encoding='utf-8') as target:
    json.dump(json_data, target, indent='\t', ensure_ascii=False)
더보기
{
"0-9": {
"confcase": "222",
"confcaserate": "1.64",
"createdt": "2020-07-15 10:40:12.153",
"criticalrate": "0",
"death": "0",
"deathrate": "0.00",
"seq": "2026",
"updatedt": "null"
},
"10-19": {
"confcase": "753",
"confcaserate": "5.56",
"createdt": "2020-07-15 10:40:12.153",
"criticalrate": "0",
"death": "0",
"deathrate": "0.00",
"seq": "2025",
"updatedt": "null"
},
"20-29": {
"confcase": "3495",
"confcaserate": "25.79",
"createdt": "2020-07-15 10:40:12.153",
"criticalrate": "0",
"death": "0",
"deathrate": "0.00",
"seq": "2024",
"updatedt": "null"
},
"30-39": {
"confcase": "1640",
"confcaserate": "12.1",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "0.12",
"death": "2",
"deathrate": "0.69",
"seq": "2023",
"updatedt": "null"
},
"40-49": {
"confcase": "1782",
"confcaserate": "13.15",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "0.17",
"death": "3",
"deathrate": "1.04",
"seq": "2022",
"updatedt": "null"
},
"50-59": {
"confcase": "2411",
"confcaserate": "17.79",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "0.62",
"death": "15",
"deathrate": "5.19",
"seq": "2021",
"updatedt": "null"
},
"60-69": {
"confcase": "1769",
"confcaserate": "13.05",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "2.32",
"death": "41",
"deathrate": "14.19",
"seq": "2020",
"updatedt": "null"
},
"70-79": {
"confcase": "902",
"confcaserate": "6.66",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "9.31",
"death": "84",
"deathrate": "29.07",
"seq": "2019",
"updatedt": "null"
},
"80 이상": {
"confcase": "577",
"confcaserate": "4.26",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "24.96",
"death": "144",
"deathrate": "49.83",
"seq": "2018",
"updatedt": "null"
},
"여성": {
"confcase": "7607",
"confcaserate": "56.14",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "1.79",
"death": "136",
"deathrate": "47.06",
"seq": "2017",
"updatedt": "null"
},
"남성": {
"confcase": "5944",
"confcaserate": "43.86",
"createdt": "2020-07-15 10:40:12.152",
"criticalrate": "2.57",
"death": "153",
"deathrate": "52.94",
"seq": "2016",
"updatedt": "null"
}
}
데이터를 활용해서 그래프를 그려보고 싶지 않으신가요?
'API' 카테고리의 다른 글
공공데이터 포털 Open API 사용하기 : 외교부_국가·지역별 최신안전소식(코로나관련) -2 : 데이터 전처리 (0) | 2020.07.17 |
---|---|
공공데이터 포털 Open API 사용하기 : 외교부_국가·지역별 최신안전소식(코로나관련) -1 (0) | 2020.07.16 |
공공데이터 포털 Open API 사용하기 : 보건복지부_코로나19 연령별·성별감염_현황 -3 : 그래프 그리기 (1) | 2020.07.15 |
공공데이터 포털 Open API 사용하기 : 보건복지부_코로나19 연령별·성별감염_현황 (0) | 2020.07.15 |
공공데이터 포털 Open API 사용하기 : 활용신청 / 샘플코드 오류해결 (0) | 2020.07.15 |