gcp-hockey-results/motm_app/club_scraper.py

# encoding=utf-8
"""
Club scraper for Hong Kong Hockey Association website
Fetches men's hockey clubs from https://hockey.org.hk
"""

import requests
from bs4 import BeautifulSoup
import re


class ClubScraper:
    """Scrapes club data from Hong Kong Hockey Association website"""

    CLUBS_URL = "https://hockey.org.hk/Content.asp?Uid=27"

    # Common club abbreviations and their full names
    CLUB_ABBREVIATIONS = {
        'Pak': 'Pakistan Association of HK Ltd.',
        'KCC': 'Kowloon Cricket Club',
        'HKFC': 'Hong Kong Football Club',
        'USRC': 'United Services Recreation Club',
        'Valley': 'Valley Fort Sports Club',
        'SSSC': 'South China Sports Club',
        'Dragons': 'Dragons Hockey Club',
        'Kai Tak': 'Kai Tak Sports Club',
        'RHOBA': 'Royal Hong Kong Regiment Officers and Businessmen Association',
        'Elite': 'Elite Hockey Club',
        'Aquila': 'Aquila Hockey Club',
        'HKJ': 'Hong Kong Jockey Club',
        'Sirius': 'Sirius Hockey Club',
        'Shaheen': 'Shaheen Hockey Club',
        'Diocesan': 'Diocesan Boys School',
        'Rhino': 'Rhino Hockey Club',
        'Khalsa': 'Khalsa Hockey Club',
        'HKCC': 'Hong Kong Cricket Club',
        'Police': 'Hong Kong Police Force',
        'Recreio': 'Recreio Hockey Club',
        'CSD': 'Correctional Services Department',
        'Dutch': 'Dutch Hockey Club',
        'HKUHC': 'Hong Kong University Hockey Club',
        'Kaitiaki': 'Kaitiaki Hockey Club',
        'Antlers': 'Antlers Hockey Club',
        'Marcellin': 'Marcellin Hockey Club',
        'Skyers': 'Skyers Hockey Club',
        'JR': 'JR Hockey Club',
        'IUHK': 'International University of Hong Kong',
        '144U': '144 United Hockey Club',
        'HKU': 'Hong Kong University',
        'UBSC': 'United Brother Sports Club',
        'Nanki': 'Nanki Sports Club',
        'Gojra': 'Gojra Hockey Club',
        'KNS': 'KNS Hockey Club',
        'Hockey Clube de Macau': 'Hockey Clube de Macau'
    }

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def fetch_clubs(self):
        """Fetch and parse clubs from the website"""
        try:
            response = self.session.get(self.CLUBS_URL, timeout=10)
            response.raise_for_status()
            return self._parse_clubs(response.text)
        except requests.RequestException as e:
            print(f"Error fetching clubs: {e}")
            return []

    def _parse_clubs(self, html_content):
        """Parse HTML content and extract club information"""
        soup = BeautifulSoup(html_content, 'lxml')
        clubs = []

        # Look for tables or structured data containing club information
        tables = soup.find_all('table')

        for table in tables:
            rows = table.find_all('tr')
            for row in rows:
                cells = row.find_all(['td', 'th'])
                if len(cells) >= 2:
                    # Extract club name from first cell
                    club_name = cells[0].get_text(strip=True)

                    # Skip header rows and empty cells
                    if not club_name or club_name.lower() in ['club', 'name', 'abbreviation', 'team', 'clubs']:
                        continue

                    # Skip if it's clearly a header row
                    if club_name == 'Clubs' and abbreviation == 'Abbreviated Title':
                        continue

                    # Extract abbreviation if available
                    abbreviation = None
                    if len(cells) > 1:
                        abbreviation = cells[1].get_text(strip=True)

                    # Extract teams if available
                    teams = []
                    if len(cells) > 2:
                        teams_text = cells[2].get_text(strip=True)
                        # Parse teams (e.g., "A, B" or "A B")
                        if teams_text:
                            teams = [team.strip() for team in re.split(r'[,;]', teams_text) if team.strip()]

                    # Extract convenor if available
                    convenor = None
                    if len(cells) > 3:
                        convenor = cells[3].get_text(strip=True)

                    # Extract email if available
                    email = None
                    if len(cells) > 4:
                        email = cells[4].get_text(strip=True)

                    club_data = {
                        'name': club_name,
                        'abbreviation': abbreviation,
                        'teams': teams,
                        'convenor': convenor,
                        'email': email
                    }
                    clubs.append(club_data)

        # If no structured data found, try to extract from text content
        if not clubs:
            clubs = self._extract_clubs_from_text(html_content)

        return clubs

    def _extract_clubs_from_text(self, html_content):
        """Extract club names from text content if no structured data found"""
        soup = BeautifulSoup(html_content, 'lxml')
        clubs = []

        # Look for common patterns in text
        text_content = soup.get_text()

        # Extract known club names from the text
        for abbreviation, full_name in self.CLUB_ABBREVIATIONS.items():
            if abbreviation in text_content or full_name in text_content:
                clubs.append({
                    'name': full_name,
                    'abbreviation': abbreviation,
                    'teams': [],
                    'convenor': None,
                    'email': None
                })

        return clubs

    def get_clubs_with_abbreviations(self):
        """Get clubs with proper abbreviation handling"""
        clubs = self.fetch_clubs()

        # Process clubs to handle abbreviations
        processed_clubs = []

        for club in clubs:
            name = club['name']
            abbreviation = club.get('abbreviation', '')

            # If we have an abbreviation, check if it's in our mapping
            if abbreviation and abbreviation in self.CLUB_ABBREVIATIONS:
                full_name = self.CLUB_ABBREVIATIONS[abbreviation]
                processed_club = club.copy()
                processed_club['name'] = full_name
                processed_club['abbreviation'] = abbreviation
                processed_clubs.append(processed_club)
            elif name in self.CLUB_ABBREVIATIONS.values():
                # If the name is already a full name, find its abbreviation
                for abbr, full in self.CLUB_ABBREVIATIONS.items():
                    if full == name:
                        processed_club = club.copy()
                        processed_club['abbreviation'] = abbr
                        processed_clubs.append(processed_club)
                        break
            else:
                # Keep as-is if no mapping found
                processed_clubs.append(club)

        return processed_clubs

    def get_club_logo_url(self, club_name):
        """Generate a logo URL for a club (placeholder implementation)"""
        # This could be enhanced to fetch actual logos from the website
        # For now, return a placeholder
        club_slug = club_name.lower().replace(' ', '_').replace('.', '').replace(',', '')
        return f"/static/images/clubs/{club_slug}_logo.png"


def get_hk_hockey_clubs():
    """Convenience function to get Hong Kong hockey clubs"""
    scraper = ClubScraper()
    return scraper.get_clubs_with_abbreviations()


def expand_club_abbreviation(abbreviation):
    """Expand a club abbreviation to its full name"""
    return ClubScraper.CLUB_ABBREVIATIONS.get(abbreviation, abbreviation)


if __name__ == "__main__":
    # Test the scraper
    print("Testing Hong Kong Hockey Club Scraper...")
    print("=" * 60)

    scraper = ClubScraper()

    print("\nFetching clubs from Hockey Hong Kong website...")
    clubs = scraper.get_clubs_with_abbreviations()

    if clubs:
        print(f"\nFound {len(clubs)} clubs:")
        for i, club in enumerate(clubs, 1):
            print(f"\n{i}. {club['name']}")
            if club.get('abbreviation'):
                print(f"   Abbreviation: {club['abbreviation']}")
            if club.get('teams'):
                print(f"   Teams: {', '.join(club['teams'])}")
            if club.get('convenor'):
                print(f"   Convenor: {club['convenor']}")
            if club.get('email'):
                print(f"   Email: {club['email']}")
    else:
        print("\nNo clubs found. This might be due to website structure changes.")
        print("Using fallback club list...")

        # Fallback to known clubs
        for abbreviation, full_name in scraper.CLUB_ABBREVIATIONS.items():
            print(f"- {full_name} ({abbreviation})")

    print("\n" + "=" * 60)