# gcp-hockey-results/motm_app/fixture_scraper.py

# encoding=utf-8
"""
Fixture scraper for Hong Kong Hockey Association website
Fetches upcoming HKFC C team fixtures from https://hockey.org.hk/MenFixture.asp
"""

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re


class FixtureScraper:
    """Scrapes fixture data from the Hong Kong Hockey Association website.

    The fixtures page is read as a flat sequence of table rows. A row that
    contains a ``colspan`` cell is treated as a header (a date header when its
    text looks like "Sunday, 7 Sep 2025"); every other row with at least five
    cells is read as a fixture played on the most recent date header.
    """

    FIXTURE_URL = "https://hockey.org.hk/MenFixture.asp"
    TARGET_TEAM = "HKFC C"

    # Matches date headers such as "Sunday, 7 Sep 2025".
    _DATE_HEADER_RE = re.compile(r'(\w+day),\s+(\d+)\s+(\w+)\s+(\d{4})')
    # The month may be abbreviated ("Sep") or spelled out ("September").
    _DATE_FORMATS = ("%d %b %Y", "%d %B %Y")

    def __init__(self):
        """Create an HTTP session that sends a browser-like User-Agent."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def fetch_fixtures(self):
        """Download the fixtures page and return the parsed target-team fixtures.

        Returns:
            list: Fixture dicts as produced by ``_parse_fixtures``. An empty
            list is returned (after printing the error) when the request
            fails or the server answers with an error status.
        """
        try:
            response = self.session.get(self.FIXTURE_URL, timeout=10)
            response.raise_for_status()
            return self._parse_fixtures(response.text)
        except requests.RequestException as e:
            print(f"Error fetching fixtures: {e}")
            return []

    def _parse_header_date(self, header_match):
        """Convert a ``_DATE_HEADER_RE`` match into a date, or None if invalid."""
        _day_name, day, month, year = header_match.groups()
        date_str = f"{day} {month} {year}"
        for date_format in self._DATE_FORMATS:
            try:
                return datetime.strptime(date_str, date_format).date()
            except ValueError:
                continue
        return None

    def _build_fixture(self, cells, match_date):
        """Build a fixture dict from a row's cells, or None if the target team is not playing."""
        # Column order: [Status], Division, Time, Venue, Home, Away, [Umpire columns...]
        # Six or more cells means a leading status column precedes the division.
        offset = 1 if len(cells) >= 6 else 0
        division, start_time, venue, home_team, away_team = (
            cell.get_text(strip=True) for cell in cells[offset:offset + 5]
        )

        is_home = self.TARGET_TEAM in home_team
        if not is_home and self.TARGET_TEAM not in away_team:
            return None

        return {
            'date': match_date,
            'time': start_time,
            'venue': venue,
            'opponent': away_team if is_home else home_team,
            'is_home': is_home,
            'home_team': home_team,
            'away_team': away_team,
            'division': division
        }

    def _parse_fixtures(self, html_content):
        """Parse HTML content and extract the target team's fixtures.

        Args:
            html_content (str): Raw HTML of the fixtures page.

        Returns:
            list: One dict per fixture involving ``TARGET_TEAM``, in document
            order, with keys ``date``, ``time``, ``venue``, ``opponent``,
            ``is_home``, ``home_team``, ``away_team`` and ``division``.
        """
        soup = BeautifulSoup(html_content, 'lxml')
        fixtures = []
        current_date = None

        for row in soup.find_all('tr'):
            # Rows with a colspan cell are headers, never fixtures.
            header_cells = row.find_all('td', colspan=True)
            if header_cells:
                header_text = header_cells[0].get_text(strip=True)
                header_match = self._DATE_HEADER_RE.search(header_text)
                if header_match:
                    # An unparseable date header resets the date to None so
                    # the rows below it are skipped instead of being filed
                    # under the previous header's date.
                    current_date = self._parse_header_date(header_match)
                continue

            cells = row.find_all('td')
            if current_date is None or len(cells) < 5:
                continue

            fixture = self._build_fixture(cells, current_date)
            if fixture is not None:
                fixtures.append(fixture)

        return fixtures

    def get_next_fixture(self):
        """Get the next upcoming fixture for the target team.

        Returns:
            dict: The earliest fixture dated today or later, or None when
            there is none (including when fetching fails).
        """
        upcoming = self.get_all_future_fixtures(limit=1)
        return upcoming[0] if upcoming else None

    def get_all_future_fixtures(self, limit=10):
        """Get future fixtures for the target team, earliest first.

        Args:
            limit (int): Maximum number of fixtures to return. A falsy value
                (``None`` or ``0``) returns every future fixture.

        Returns:
            list: Fixture dicts dated today or later, sorted by date.
            Fixtures on the same date keep their order from the page.
        """
        today = datetime.now().date()
        upcoming = sorted(
            (fixture for fixture in self.fetch_fixtures() if fixture['date'] >= today),
            key=lambda fixture: fixture['date'],
        )
        return upcoming[:limit] if limit else upcoming


def get_next_hkfc_c_fixture():
    """Return the next HKFC C fixture using a throwaway FixtureScraper.

    Returns whatever ``FixtureScraper.get_next_fixture`` returns: a fixture
    dict, or None when no upcoming fixture is available.
    """
    return FixtureScraper().get_next_fixture()


def get_opponent_club_name(opponent_team):
    """Extract the club name from an opponent team name (e.g., 'KCC B' -> 'KCC').

    Args:
        opponent_team (str): Team name as scraped, e.g. "KCC B" or "Valley A".

    Returns:
        str: The name with a trailing single team letter (A-H) removed and
        surrounding whitespace stripped, or None if ``opponent_team`` is
        empty or None.
    """
    if not opponent_team:
        return None

    # Strip first: otherwise trailing whitespace ("KCC B ") stops the
    # end-anchored pattern from seeing the team letter.
    return re.sub(r'\s+[A-H]$', '', opponent_team.strip())


def match_opponent_to_club(opponent_team, clubs_database=None):
    """
    Match an opponent team name to a club in the database

    Candidate club names are derived from the team name (team letter removed,
    first word, first two words). An exact case-insensitive match on any
    candidate wins over a partial (substring) match on any candidate; if
    nothing matches, the first candidate is returned as a low-confidence guess.

    Args:
        opponent_team (str): The opponent team name (e.g., "KCC B", "Valley A")
        clubs_database (list): List of club names from the database. If None,
            the names are fetched from the ``clubs`` table; the database
            modules are only imported in that case.

    Returns:
        dict: ``{'club_name', 'match_type', 'confidence'}`` where match_type is
        'exact', 'partial' or 'guess'. None if ``opponent_team`` is empty or
        blank, or if the clubs must be fetched and the database modules
        cannot be imported.
    """
    if not opponent_team:
        return None

    team_name = opponent_team.strip()
    if not team_name:
        # A blank name would otherwise "partially match" every club.
        return None

    # Get clubs from database if not provided
    if clubs_database is None:
        # Import here to avoid circular imports
        try:
            from db_config import sql_read
            from sqlalchemy import text
        except ImportError:
            return None

        try:
            clubs_result = sql_read(text("SELECT hockey_club FROM clubs ORDER BY hockey_club"))
            clubs_database = [club['hockey_club'] for club in clubs_result] if clubs_result else []
        except Exception as e:
            # Best effort: fall through to the low-confidence guess below.
            print(f"Error loading clubs: {e}")
            clubs_database = []

    # Empty/None entries cannot be matched meaningfully ("" is a substring of
    # everything), so drop them and lower-case the rest once.
    clubs = [(club, club.lower()) for club in clubs_database if club]

    # Method 1: Remove team letters (A-H)
    candidates = [re.sub(r'\s+[A-H]$', '', team_name).strip()]

    # Method 2: Remove a single-letter suffix (A-J)
    team_suffixes = tuple(f" {letter}" for letter in "ABCDEFGHIJ")
    if team_name.endswith(team_suffixes):
        candidates.append(team_name[:-2].strip())

    # Method 3: First word, and first two words of longer names
    words = team_name.split()
    if len(words) > 1:
        candidates.append(words[0])
        if len(words) > 2:
            candidates.append(' '.join(words[:2]))

    # De-duplicate while keeping priority order; ignore empty candidates.
    candidates = [name for name in dict.fromkeys(candidates) if name]
    lowered_candidates = [name.lower() for name in candidates]

    # Exact matches first, across every candidate
    for wanted in lowered_candidates:
        for club, club_lower in clubs:
            if club_lower == wanted:
                return {
                    'club_name': club,
                    'match_type': 'exact',
                    'confidence': 'high'
                }

    # Partial match (either name contains the other)
    for wanted in lowered_candidates:
        for club, club_lower in clubs:
            if wanted in club_lower or club_lower in wanted:
                return {
                    'club_name': club,
                    'match_type': 'partial',
                    'confidence': 'medium'
                }

    # If no match found, return the best guess
    best_guess = candidates[0] if candidates else team_name
    return {
        'club_name': best_guess,
        'match_type': 'guess',
        'confidence': 'low'
    }


def get_opponent_club_info(opponent_team):
    """
    Look up the stored club record for an opponent team

    Args:
        opponent_team (str): The opponent team name

    Returns:
        dict: ``id``, ``club_name``, ``logo_url`` and ``match_result`` when the
        matched club exists in the ``clubs`` table; a dict without ``id`` and
        with ``logo_url`` set to None when it does not. None if the team name
        is empty, the database modules cannot be imported, no match result is
        produced, or the lookup raises.
    """
    if not opponent_team:
        return None

    # Imported inside the function, as elsewhere in this module.
    try:
        from db_config import sql_read
        from sqlalchemy import text
    except ImportError:
        return None

    # Resolve the team name to a club name before querying.
    matched = match_opponent_to_club(opponent_team)
    if not matched:
        return None

    resolved_name = matched['club_name']

    try:
        query = text("SELECT id, hockey_club, logo_url FROM clubs WHERE hockey_club = :club_name")
        rows = sql_read(query, {'club_name': resolved_name})

        if not rows:
            # No stored record: hand back the match result on its own.
            return {
                'club_name': resolved_name,
                'logo_url': None,
                'match_result': matched
            }

        record = rows[0]
        return {
            'id': record['id'],
            'club_name': record['hockey_club'],
            'logo_url': record['logo_url'],
            'match_result': matched
        }
    except Exception as e:
        print(f"Error getting club info: {e}")
        return None


if __name__ == "__main__":
    # Manual smoke test: fetches the live fixtures page and prints the results.
    banner = "=" * 60
    print("Testing Hong Kong Hockey Fixture Scraper...")
    print(banner)

    demo_scraper = FixtureScraper()

    print("\nFetching next HKFC C fixture...")
    upcoming = demo_scraper.get_next_fixture()

    if not upcoming:
        print("\nNo upcoming fixtures found.")
    else:
        venue_side = 'Home' if upcoming['is_home'] else 'Away'
        print(f"\nNext HKFC C Match:")
        print(f"  Date: {upcoming['date'].strftime('%A, %d %B %Y')}")
        print(f"  Time: {upcoming['time']}")
        print(f"  Venue: {upcoming['venue']}")
        print(f"  Opponent: {upcoming['opponent']}")
        print(f"  Home/Away: {venue_side}")
        print(f"  Division: {upcoming['division']}")

        # Show the club the opponent team belongs to.
        opponent_club = get_opponent_club_name(upcoming['opponent'])
        print(f"  Opponent Club: {opponent_club}")

    print("\n" + banner)
    print("\nFetching next 5 HKFC C fixtures...")
    next_five = demo_scraper.get_all_future_fixtures(limit=5)

    if not next_five:
        print("\nNo upcoming fixtures found.")
    else:
        for position, match in enumerate(next_five, 1):
            print(f"\n{position}. {match['date'].strftime('%d %b %Y')} vs {match['opponent']} ({match['venue']})")