# encoding=utf-8
"""
Club scraper for Hong Kong Hockey Association website
Fetches men's hockey clubs from https://hockey.org.hk
"""

import re
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup


class ClubScraper:
    """Scrapes club data from Hong Kong Hockey Association website.

    Each club is represented as a dict with the keys ``name``,
    ``abbreviation``, ``teams``, ``convenor`` and ``email``.
    """

    CLUBS_URL = "https://hockey.org.hk/Content.asp?Uid=27"

    # Common club abbreviations and their full names
    CLUB_ABBREVIATIONS = {
        'Pak': 'Pakistan Association of HK Ltd.',
        'KCC': 'Kowloon Cricket Club',
        'HKFC': 'Hong Kong Football Club',
        'USRC': 'United Services Recreation Club',
        'Valley': 'Valley Fort Sports Club',
        'SSSC': 'South China Sports Club',
        'Dragons': 'Dragons Hockey Club',
        'Kai Tak': 'Kai Tak Sports Club',
        'RHOBA': 'Royal Hong Kong Regiment Officers and Businessmen Association',
        'Elite': 'Elite Hockey Club',
        'Aquila': 'Aquila Hockey Club',
        'HKJ': 'Hong Kong Jockey Club',
        'Sirius': 'Sirius Hockey Club',
        'Shaheen': 'Shaheen Hockey Club',
        'Diocesan': 'Diocesan Boys School',
        'Rhino': 'Rhino Hockey Club',
        'Khalsa': 'Khalsa Hockey Club',
        'HKCC': 'Hong Kong Cricket Club',
        'Police': 'Hong Kong Police Force',
        'Recreio': 'Recreio Hockey Club',
        'CSD': 'Correctional Services Department',
        'Dutch': 'Dutch Hockey Club',
        'HKUHC': 'Hong Kong University Hockey Club',
        'Kaitiaki': 'Kaitiaki Hockey Club',
        'Antlers': 'Antlers Hockey Club',
        'Marcellin': 'Marcellin Hockey Club',
        'Skyers': 'Skyers Hockey Club',
        'JR': 'JR Hockey Club',
        'IUHK': 'International University of Hong Kong',
        '144U': '144 United Hockey Club',
        'HKU': 'Hong Kong University',
        'UBSC': 'United Brother Sports Club',
        'Nanki': 'Nanki Sports Club',
        'Gojra': 'Gojra Hockey Club',
        'KNS': 'KNS Hockey Club',
        'Hockey Clube de Macau': 'Hockey Clube de Macau'
    }

    # First-cell values (lower-cased) that mark a header row, not a club.
    _HEADER_NAME_LABELS = ('club', 'name', 'abbreviation', 'team', 'clubs')

    # Second-cell values (lower-cased) that mark a header row, e.g. the
    # "Clubs | Abbreviated Title" heading of the clubs table.
    _HEADER_ABBREVIATION_LABELS = ('abbreviated title', 'abbreviation')

    def __init__(self):
        """Create an HTTP session that sends a browser-like User-Agent."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def fetch_clubs(self) -> List[Dict[str, Any]]:
        """Fetch and parse clubs from the website.

        Returns:
            The parsed club dicts, or an empty list when the HTTP request
            fails (the error is printed, not raised).
        """
        try:
            response = self.session.get(self.CLUBS_URL, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching clubs: {e}")
            return []
        return self._parse_clubs(response.text)

    def _parse_clubs(self, html_content: str) -> List[Dict[str, Any]]:
        """Parse HTML content and extract club information.

        Every ``<tr>`` of every ``<table>`` with at least two cells is
        treated as a candidate club row. When no row yields a club, the
        page text is scanned for known club names instead.

        Args:
            html_content: Raw HTML of the clubs page.

        Returns:
            A list of club dicts (possibly empty).
        """
        soup = BeautifulSoup(html_content, 'lxml')
        clubs = []

        # NOTE(review): find_all() is recursive, so rows of nested tables
        # are visited once per enclosing table. Confirm the page has no
        # nested tables, otherwise clubs may be duplicated.
        for table in soup.find_all('table'):
            for row in table.find_all('tr'):
                club = self._parse_row(row.find_all(['td', 'th']))
                if club is not None:
                    clubs.append(club)

        # If no structured data found, try to extract from text content
        if not clubs:
            clubs = self._extract_clubs_from_text(html_content)

        return clubs

    def _parse_row(self, cells) -> Optional[Dict[str, Any]]:
        """Turn the cells of one table row into a club dict.

        Args:
            cells: The ``td``/``th`` elements of the row, in column order:
                name, abbreviation, teams, convenor, email.

        Returns:
            The club dict, or ``None`` when the row has fewer than two
            cells, has an empty name, or is a header row.
        """
        if len(cells) < 2:
            return None

        club_name = cells[0].get_text(strip=True)
        # The abbreviation must be read *before* the header checks below;
        # the second cell always exists because of the length guard above.
        abbreviation = cells[1].get_text(strip=True)

        # Skip empty rows and rows whose first cell is a header label
        if not club_name or club_name.lower() in self._HEADER_NAME_LABELS:
            return None

        # Skip header rows recognisable only by their second column,
        # e.g. "Club Name | Abbreviated Title"
        if abbreviation.lower() in self._HEADER_ABBREVIATION_LABELS:
            return None

        # Teams are listed in one cell, separated by commas or semicolons
        teams = []
        if len(cells) > 2:
            teams_text = cells[2].get_text(strip=True)
            teams = [
                team.strip()
                for team in re.split(r'[,;]', teams_text)
                if team.strip()
            ]

        # Optional trailing columns stay None when the row lacks them
        convenor = cells[3].get_text(strip=True) if len(cells) > 3 else None
        email = cells[4].get_text(strip=True) if len(cells) > 4 else None

        return {
            'name': club_name,
            'abbreviation': abbreviation,
            'teams': teams,
            'convenor': convenor,
            'email': email
        }

    def _extract_clubs_from_text(self, html_content: str) -> List[Dict[str, Any]]:
        """Extract club names from text content if no structured data found.

        A club from ``CLUB_ABBREVIATIONS`` is reported when either its
        abbreviation or its full name occurs anywhere in the page text.
        This is a plain substring test, so short abbreviations can match
        inside longer words.

        Args:
            html_content: Raw HTML of the clubs page.

        Returns:
            One dict per matched club, with empty ``teams`` and no
            ``convenor``/``email``.
        """
        text_content = BeautifulSoup(html_content, 'lxml').get_text()

        return [
            {
                'name': full_name,
                'abbreviation': abbreviation,
                'teams': [],
                'convenor': None,
                'email': None
            }
            for abbreviation, full_name in self.CLUB_ABBREVIATIONS.items()
            if abbreviation in text_content or full_name in text_content
        ]

    def get_clubs_with_abbreviations(self) -> List[Dict[str, Any]]:
        """Get clubs with proper abbreviation handling.

        For each fetched club:

        * a known abbreviation replaces ``name`` with the mapped full name;
        * otherwise, a ``name`` that is already a known full name gets its
          abbreviation filled in;
        * otherwise the club is passed through unchanged.

        Returns:
            The normalised club dicts, in fetch order.
        """
        # Reverse lookup (full name -> abbreviation); setdefault keeps the
        # first abbreviation should two entries ever share a full name.
        abbreviation_by_name = {}
        for abbr, full in self.CLUB_ABBREVIATIONS.items():
            abbreviation_by_name.setdefault(full, abbr)

        processed_clubs = []
        for club in self.fetch_clubs():
            name = club['name']
            abbreviation = club.get('abbreviation', '')

            if abbreviation and abbreviation in self.CLUB_ABBREVIATIONS:
                processed_club = club.copy()
                processed_club['name'] = self.CLUB_ABBREVIATIONS[abbreviation]
                processed_club['abbreviation'] = abbreviation
                processed_clubs.append(processed_club)
            elif name in abbreviation_by_name:
                processed_club = club.copy()
                processed_club['abbreviation'] = abbreviation_by_name[name]
                processed_clubs.append(processed_club)
            else:
                # Keep as-is if no mapping found
                processed_clubs.append(club)

        return processed_clubs

    def get_club_logo_url(self, club_name: str) -> str:
        """Generate a logo URL for a club (placeholder implementation).

        The name is lower-cased, spaces become underscores, and periods
        and commas are removed to form the file-name slug.

        Args:
            club_name: Full club name.

        Returns:
            A path of the form ``/static/images/clubs/<slug>_logo.png``.
        """
        # This could be enhanced to fetch actual logos from the website
        club_slug = club_name.lower().replace(' ', '_').replace('.', '').replace(',', '')
        return f"/static/images/clubs/{club_slug}_logo.png"


def get_hk_hockey_clubs() -> List[Dict[str, Any]]:
    """Convenience function to get Hong Kong hockey clubs."""
    scraper = ClubScraper()
    return scraper.get_clubs_with_abbreviations()


def expand_club_abbreviation(abbreviation: str) -> str:
    """Expand a club abbreviation to its full name.

    Returns the input unchanged when the abbreviation is not known.
    """
    return ClubScraper.CLUB_ABBREVIATIONS.get(abbreviation, abbreviation)


def _main() -> None:
    """Fetch the clubs and print them, as a manual smoke test."""
    print("Testing Hong Kong Hockey Club Scraper...")
    print("=" * 60)

    scraper = ClubScraper()

    print("\nFetching clubs from Hockey Hong Kong website...")
    clubs = scraper.get_clubs_with_abbreviations()

    if clubs:
        print(f"\nFound {len(clubs)} clubs:")
        for i, club in enumerate(clubs, 1):
            print(f"\n{i}. {club['name']}")
            if club.get('abbreviation'):
                print(f" Abbreviation: {club['abbreviation']}")
            if club.get('teams'):
                print(f" Teams: {', '.join(club['teams'])}")
            if club.get('convenor'):
                print(f" Convenor: {club['convenor']}")
            if club.get('email'):
                print(f" Email: {club['email']}")
    else:
        print("\nNo clubs found. This might be due to website structure changes.")
        print("Using fallback club list...")

        # Fallback to known clubs
        for abbreviation, full_name in scraper.CLUB_ABBREVIATIONS.items():
            print(f"- {full_name} ({abbreviation})")

    print("\n" + "=" * 60)


if __name__ == "__main__":
    _main()