728x90
문제
선수 리스트에서 시세 데이터를 추출하려던 중, 다음과 같이 클래스 이름 뒤에 숫자가 붙은(예: span_bp1, span_bp2, …) 형태의 태그들을 만났습니다.
클래스 이름을 'span_bp' 로 그대로 지정하는 일반적인 방법으로는 다음과 같이 추출하지 못합니다.
from bs4 import BeautifulSoup as bs
# Load the saved page and parse it with Python's built-in HTML parser.
with open('sample.html', 'r', encoding='utf-8') as f:
    text = f.read()
html = bs(text, 'html.parser')

# Every <div class="tr"> is treated as one row of the listing.
items = html.find_all('div', {'class': 'tr'})
print(f'item count: {len(items)}')

# Naive lookup, kept on purpose to demonstrate the problem: a plain string
# only matches a tag whose class is exactly 'span_bp', so spans whose class
# carries a numeric suffix are not found and every row yields an empty list.
result = [
    [t.get_text() for t in item.find_all('span', {'class': 'span_bp'})]
    for item in items
]
print(result)
[[], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
정규표현식
다음과 같이 class 값으로 문자열 대신 re.compile(정규표현식) 을 넘기면 됩니다.
정규표현식에서 \d 는 숫자 한 글자를 나타내고, \d{1,2} 는 숫자가 1~2번 연속해서 나타난다는 것을 뜻합니다.
import re
import requests
from bs4 import BeautifulSoup as bs
import os
import pprint

# Price spans use a numbered class, so they are matched by pattern instead of
# by a fixed name. The raw string keeps `\d` from being read as an (invalid)
# string escape, and the pattern is compiled once rather than on every row.
bp_class_pattern = re.compile(r'span_bp\d{1,2}')

# Load the saved page and parse it with Python's built-in HTML parser.
with open('sample.html', 'r', encoding='utf-8') as f:
    text = f.read()
html = bs(text, 'html.parser')

# Maps an item id (derived from the season icon file name) to its price texts.
data = {}
for tr in html.find_all('div', {'class': 'tr'}):
    # The season icon's file name without its extension is used as the key,
    # e.g. ".../19UCL.png" -> "19UCL" (URL shape assumed; verify on the page).
    icon_url = tr.find('div', {'class': 'season'}).img['src']
    icon_fname = os.path.basename(icon_url)
    item_id = icon_fname.split('.')[0]

    bp = [t.get_text() for t in tr.find_all('span', {'class': bp_class_pattern})]

    # setdefault keeps the first row seen for an id; later rows with the same
    # id are ignored rather than overwriting it.
    data.setdefault(item_id, bp)

pprint.pprint(data)
더보기
{'18PLS': ['-',
'13,500,000',
'18,900,000',
'28,400,000',
'45,400,000',
'90,800,000',
'200,000,000',
'542,000,000',
'2,900,000,000',
'9,910,000,000',
'31,700,000,000'],
'19PLA': ['-',
'2,010,000',
'2,810,000',
'4,220,000',
'7,000,000',
'14,200,000',
'113,000,000',
'545,000,000',
'4,720,000,000',
'15,400,000,000',
'49,300,000,000'],
'19PLS': ['-',
'21,200,000',
'32,500,000',
'48,800,000',
'78,100,000',
'156,000,000',
'356,000,000',
'1,550,000,000',
'15,500,000,000',
'64,100,000,000',
'205,000,000,000'],
'19TOTS': ['-',
'906,000,000',
'1,270,000,000',
'1,910,000,000',
'3,060,000,000',
'6,120,000,000',
'13,500,000,000',
'32,400,000,000',
'89,200,000,000',
'250,000,000,000',
'800,000,000,000'],
'19UCL': ['-',
'41,700,000',
'78,500,000',
'238,000,000',
'686,000,000',
'2,440,000,000',
'5,480,000,000',
'31,200,000,000',
'81,100,000,000',
'227,000,000,000',
'726,000,000,000'],
'2019KFA': ['-',
'2,120,000',
'2,970,000',
'4,460,000',
'10,200,000',
'36,800,000',
'302,000,000',
'1,970,000,000',
'16,600,000,000',
'58,000,000,000',
'186,000,000,000'],
'20TOTN': ['-',
'677,000,000',
'948,000,000',
'1,420,000,000',
'2,270,000,000',
'5,500,000,000',
'13,100,000,000',
'46,100,000,000',
'120,000,000,000',
'336,000,000,000',
'1,080,000,000,000'],
'20TOTS': ['-',
'1,490,000,000',
'2,090,000,000',
'3,140,000,000',
'5,020,000,000',
'10,900,000,000',
'24,900,000,000',
'66,500,000,000',
'173,000,000,000',
'484,000,000,000',
'1,550,000,000,000'],
'KFA': ['-',
'1,220,000',
'2,020,000',
'3,040,000',
'5,930,000',
'14,300,000',
'143,000,000',
'710,000,000',
'4,000,000,000',
'21,000,000,000',
'89,300,000,000'],
'LH': ['-',
'255,000,000',
'357,000,000',
'536,000,000',
'858,000,000',
'1,720,000,000',
'3,780,000,000',
'9,070,000,000',
'42,100,000,000',
'118,000,000,000',
'378,000,000,000'],
'LIVE': ['-',
'1,850,000',
'2,710,000',
'4,650,000',
'11,300,000',
'22,700,000',
'95,500,000',
'317,000,000',
'824,000,000',
'5,120,000,000',
'18,200,000,000'],
'NHD': ['-',
'2,500,000',
'3,570,000',
'5,490,000',
'8,780,000',
'17,600,000',
'44,000,000',
'237,000,000',
'616,000,000',
'2,390,000,000',
'8,960,000,000'],
'PLC': ['-',
'2,550,000',
'3,820,000',
'5,740,000',
'9,900,000',
'19,800,000',
'46,900,000',
'180,000,000',
'869,000,000',
'5,170,000,000',
'17,700,000,000'],
'TB': ['-',
'1,240,000',
'1,560,000',
'2,440,000',
'10,300,000',
'32,300,000',
'181,000,000',
'535,000,000',
'3,260,000,000',
'12,200,000,000',
'79,000,000,000'],
'TT': ['-',
'6,690,000',
'9,370,000',
'14,100,000',
'22,600,000',
'45,200,000',
'127,000,000',
'400,000,000',
'1,040,000,000',
'4,630,000,000',
'14,800,000,000']}
'Crawling > Basic' 카테고리의 다른 글
파이썬 다음-뉴스 댓글 크롤링 하기 -2 : 동적인 데이터를 수집하는 방법 (0) | 2020.08.09 |
---|---|
파이썬 다음-뉴스 크롤링 하기 -1 (0) | 2020.08.07 |
파이썬 크롤링할 범위를 구하는 방법 (0) | 2020.08.07 |
아이피 우회하여 크롤링하기 tor / requests[socks] (0) | 2020.08.03 |
파이썬 구글 날씨 검색기(크롤러) 만들기 : 모듈화와 핵심 팁 (0) | 2020.07.22 |