본문으로 바로가기

정규표현식으로 태그 추출하기

category Crawling/Basic 2020. 7. 27. 14:35
728x90

문제

선수 리스트에서 시세 데이터를 추출하려던 중, 다음과 같은 형태의 태그들을 만났습니다.

게임 사이트의 선수 검색 결과입니다.

시세가 들어 있는 span 태그의 클래스 이름 뒤에 숫자가 붙어 있기 때문에(예: span_bp1, span_bp2), 아래처럼 'span_bp'라는 클래스 이름만으로 찾는 일반적인 방법으로는 값을 추출하지 못합니다.

from bs4 import BeautifulSoup as bs

# Read the saved search-result page from disk.
with open('sample.html', 'r', encoding='utf-8') as fp:
    text = fp.read()

html = bs(text, 'html.parser')

# One <div class="tr"> per row of the search result.
items = html.find_all('div', class_='tr')
print(f'item count: {len(items)}')

# For every row, gather the text of each <span> whose class is 'span_bp'.
result = [
    [span.get_text() for span in row.find_all('span', class_='span_bp')]
    for row in items
]

print(result)
[[], [], [], [], [], [], [], [], [], [], [], [], [], [], []]

 

정규표현식

다음과 같이 클래스 값으로 문자열 대신 re.compile(정규표현식)을 넘겨주면 됩니다.

정규표현식에서 \d는 숫자 하나를 나타내고, \d{1,2}는 숫자가 1~2번 연속으로 나타난다는 것을 뜻합니다.

import re
import requests
from bs4 import BeautifulSoup as bs
import os

import pprint

# Read the saved search-result page from disk.
with open('sample.html', 'r', encoding='utf-8') as f:
    text = f.read()

html = bs(text, 'html.parser')

# Matches class names made of "span_bp" followed by one or two digits.
# A raw string keeps "\d" from being parsed as an (invalid) string escape,
# and compiling once here avoids rebuilding the pattern for every row.
bp_class = re.compile(r'span_bp\d{1,2}')

data = {}
for tr in html.find_all('div', {'class': 'tr'}):
    # The season icon's file name, minus its extension, is used as the key.
    icon_url = tr.find('div', {'class': 'season'}).img['src']
    icon_fname = os.path.basename(icon_url)
    item_id = icon_fname.split('.')[0]
    bp = [t.get_text() for t in tr.find_all('span', {'class': bp_class})]
    # setdefault keeps the first row stored under a key; later rows with the
    # same key are ignored.
    data.setdefault(item_id, bp)

pprint.pprint(data)
더보기
{'18PLS': ['-',
           '13,500,000',
           '18,900,000',
           '28,400,000',
           '45,400,000',
           '90,800,000',
           '200,000,000',
           '542,000,000',
           '2,900,000,000',
           '9,910,000,000',
           '31,700,000,000'],
 '19PLA': ['-',
           '2,010,000',
           '2,810,000',
           '4,220,000',
           '7,000,000',
           '14,200,000',
           '113,000,000',
           '545,000,000',
           '4,720,000,000',
           '15,400,000,000',
           '49,300,000,000'],
 '19PLS': ['-',
           '21,200,000',
           '32,500,000',
           '48,800,000',
           '78,100,000',
           '156,000,000',
           '356,000,000',
           '1,550,000,000',
           '15,500,000,000',
           '64,100,000,000',
           '205,000,000,000'],
 '19TOTS': ['-',
            '906,000,000',
            '1,270,000,000',
            '1,910,000,000',
            '3,060,000,000',
            '6,120,000,000',
            '13,500,000,000',
            '32,400,000,000',
            '89,200,000,000',
            '250,000,000,000',
            '800,000,000,000'],
 '19UCL': ['-',
           '41,700,000',
           '78,500,000',
           '238,000,000',
           '686,000,000',
           '2,440,000,000',
           '5,480,000,000',
           '31,200,000,000',
           '81,100,000,000',
           '227,000,000,000',
           '726,000,000,000'],
 '2019KFA': ['-',
             '2,120,000',
             '2,970,000',
             '4,460,000',
             '10,200,000',
             '36,800,000',
             '302,000,000',
             '1,970,000,000',
             '16,600,000,000',
             '58,000,000,000',
             '186,000,000,000'],
 '20TOTN': ['-',
            '677,000,000',
            '948,000,000',
            '1,420,000,000',
            '2,270,000,000',
            '5,500,000,000',
            '13,100,000,000',
            '46,100,000,000',
            '120,000,000,000',
            '336,000,000,000',
            '1,080,000,000,000'],
 '20TOTS': ['-',
            '1,490,000,000',
            '2,090,000,000',
            '3,140,000,000',
            '5,020,000,000',
            '10,900,000,000',
            '24,900,000,000',
            '66,500,000,000',
            '173,000,000,000',
            '484,000,000,000',
            '1,550,000,000,000'],
 'KFA': ['-',
         '1,220,000',
         '2,020,000',
         '3,040,000',
         '5,930,000',
         '14,300,000',
         '143,000,000',
         '710,000,000',
         '4,000,000,000',
         '21,000,000,000',
         '89,300,000,000'],
 'LH': ['-',
        '255,000,000',
        '357,000,000',
        '536,000,000',
        '858,000,000',
        '1,720,000,000',
        '3,780,000,000',
        '9,070,000,000',
        '42,100,000,000',
        '118,000,000,000',
        '378,000,000,000'],
 'LIVE': ['-',
          '1,850,000',
          '2,710,000',
          '4,650,000',
          '11,300,000',
          '22,700,000',
          '95,500,000',
          '317,000,000',
          '824,000,000',
          '5,120,000,000',
          '18,200,000,000'],
 'NHD': ['-',
         '2,500,000',
         '3,570,000',
         '5,490,000',
         '8,780,000',
         '17,600,000',
         '44,000,000',
         '237,000,000',
         '616,000,000',
         '2,390,000,000',
         '8,960,000,000'],
 'PLC': ['-',
         '2,550,000',
         '3,820,000',
         '5,740,000',
         '9,900,000',
         '19,800,000',
         '46,900,000',
         '180,000,000',
         '869,000,000',
         '5,170,000,000',
         '17,700,000,000'],
 'TB': ['-',
        '1,240,000',
        '1,560,000',
        '2,440,000',
        '10,300,000',
        '32,300,000',
        '181,000,000',
        '535,000,000',
        '3,260,000,000',
        '12,200,000,000',
        '79,000,000,000'],
 'TT': ['-',
        '6,690,000',
        '9,370,000',
        '14,100,000',
        '22,600,000',
        '45,200,000',
        '127,000,000',
        '400,000,000',
        '1,040,000,000',
        '4,630,000,000',
        '14,800,000,000']}