Source code for dancesport_parser.o2cm.parser

import datetime
import re
from os import path
from typing import List
from urllib.parse import urljoin

from bs4 import BeautifulSoup

from dancesport_parser.util import parseHtml
from dancesport_parser.o2cm.model import Competition


RESULTS_MAIN_DOMAIN = "https://results.o2cm.com/"



[docs]
def parseMain(htmlDOM: BeautifulSoup = None, rawHtml: str = None) -> List[Competition]:
    """Parse o2cm main results screen, i.e. results.o2cm.com.

    The page is expected to contain an ``<input id="inyear">`` whose value is
    the initial year, and a ``<table id="main_tbl">`` whose rows are either
    year headers (first cell has class ``h3``) or competition rows (row has
    class ``t1n``) made of a ``"Mon DD"`` date cell, a name cell and a link to
    the event page. Rows matching neither shape are ignored.

    :param htmlDOM: HTML contents to parse.
    :param rawHtml: If `htmlDOM` is unspecified, raw string to directly parse.
    :return: List of Competitions parsed from HTML, in the order their rows
        appear in the table.
    :raises RuntimeError: If neither `htmlDOM` nor `rawHtml` is provided.
    :raises IndexError: If the ``main_tbl`` table or ``inyear`` input is
        missing, or a competition row has fewer than two cells.
    :raises ValueError: If a competition row has no link, its link does not
        look like an event URL, or its date cannot be parsed.
    """
    if htmlDOM is None and rawHtml is None:
        raise RuntimeError("Expected either htmlDOM or rawHtml to be provided.")

    if htmlDOM is None:
        htmlDOM = parseHtml(rawHtml)

    # Compiled once up front instead of being looked up again for every row.
    compUrlPattern = re.compile(r'event[23].asp\?event=([a-zA-Z]{0,4}\d{0,5}[a-zA-Z]?)&.*')

    results: List[Competition] = []
    competitionsTable = htmlDOM.find_all("table", id="main_tbl")[0]
    yearInput = htmlDOM.find_all('input', id='inyear')[0]
    # Year that applies to competition rows until a year-header row replaces it.
    year = int(yearInput['value'])
    for row in competitionsTable.find_all("tr"):
        rowData = row.find_all("td")
        if not rowData:
            continue

        if "class" in rowData[0].attrs and rowData[0]["class"][0] == "h3":
            # Year-header row: every following competition row belongs to it.
            year = int(rowData[0].get_text().strip())
        elif "class" in row.attrs and row["class"][0] == "t1n":
            date = rowData[0].get_text().strip()
            compName = rowData[1].get_text().strip()

            link = row.find("a")
            compUrl = link.get("href") if link is not None else None
            matchCompUrl = compUrlPattern.match(compUrl) if compUrl else None
            if matchCompUrl is None:
                # Fail with context rather than an opaque error on None.
                raise ValueError(
                    f"Unrecognised competition link {compUrl!r} for competition {compName!r}"
                )
            compId = matchCompUrl.group(1).lower()

            # The date cell carries no year, so append the tracked one.
            fullDate = f"{date} {year}"
            compDate = datetime.datetime.strptime(fullDate, "%b %d %Y").date()

            # urljoin is the URL-aware join; os.path.join follows filesystem rules.
            competition = Competition(compId, compName, compDate, urljoin(RESULTS_MAIN_DOMAIN, compUrl))
            results.append(competition)
    return results