238 lines
9.1 KiB
Python
238 lines
9.1 KiB
Python
# encoding=utf-8
|
|
"""
|
|
Club scraper for Hong Kong Hockey Association website
|
|
Fetches men's hockey clubs from https://hockey.org.hk
|
|
"""
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import re
|
|
|
|
|
|
class ClubScraper:
    """Scrapes club data from the Hong Kong Hockey Association website.

    Fetches the men's club listing page and parses each table row into a
    dict with the keys: ``name``, ``abbreviation``, ``teams``, ``convenor``
    and ``email``.  Falls back to a plain-text scan of the page when no
    table rows match.
    """

    # Page listing the men's hockey clubs.
    CLUBS_URL = "https://hockey.org.hk/Content.asp?Uid=27"

    # Common club abbreviations and their full names.  Used both to expand
    # scraped abbreviations and as fallback data when scraping fails.
    CLUB_ABBREVIATIONS = {
        'Pak': 'Pakistan Association of HK Ltd.',
        'KCC': 'Kowloon Cricket Club',
        'HKFC': 'Hong Kong Football Club',
        'USRC': 'United Services Recreation Club',
        'Valley': 'Valley Fort Sports Club',
        'SSSC': 'South China Sports Club',
        'Dragons': 'Dragons Hockey Club',
        'Kai Tak': 'Kai Tak Sports Club',
        'RHOBA': 'Royal Hong Kong Regiment Officers and Businessmen Association',
        'Elite': 'Elite Hockey Club',
        'Aquila': 'Aquila Hockey Club',
        'HKJ': 'Hong Kong Jockey Club',
        'Sirius': 'Sirius Hockey Club',
        'Shaheen': 'Shaheen Hockey Club',
        'Diocesan': 'Diocesan Boys School',
        'Rhino': 'Rhino Hockey Club',
        'Khalsa': 'Khalsa Hockey Club',
        'HKCC': 'Hong Kong Cricket Club',
        'Police': 'Hong Kong Police Force',
        'Recreio': 'Recreio Hockey Club',
        'CSD': 'Correctional Services Department',
        'Dutch': 'Dutch Hockey Club',
        'HKUHC': 'Hong Kong University Hockey Club',
        'Kaitiaki': 'Kaitiaki Hockey Club',
        'Antlers': 'Antlers Hockey Club',
        'Marcellin': 'Marcellin Hockey Club',
        'Skyers': 'Skyers Hockey Club',
        'JR': 'JR Hockey Club',
        'IUHK': 'International University of Hong Kong',
        '144U': '144 United Hockey Club',
        'HKU': 'Hong Kong University',
        'UBSC': 'United Brother Sports Club',
        'Nanki': 'Nanki Sports Club',
        'Gojra': 'Gojra Hockey Club',
        'KNS': 'KNS Hockey Club',
        'Hockey Clube de Macau': 'Hockey Clube de Macau'
    }

    def __init__(self):
        # A shared session with a browser-like User-Agent; some servers
        # reject the default python-requests User-Agent string.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def fetch_clubs(self):
        """Fetch and parse clubs from the website.

        Returns:
            list[dict]: parsed club dicts; an empty list on any
            network/HTTP error (best-effort, errors are printed).
        """
        try:
            response = self.session.get(self.CLUBS_URL, timeout=10)
            response.raise_for_status()
            return self._parse_clubs(response.text)
        except requests.RequestException as e:
            print(f"Error fetching clubs: {e}")
            return []

    def _parse_clubs(self, html_content):
        """Parse HTML content and extract club information.

        Walks every table row with at least two cells; columns are assumed
        to be name / abbreviation / teams / convenor / email in that order
        (later columns optional) — TODO confirm against the live page.

        Args:
            html_content: raw HTML of the clubs page.

        Returns:
            list[dict]: one dict per club row found.
        """
        soup = BeautifulSoup(html_content, 'lxml')
        clubs = []

        # Look for tables or structured data containing club information.
        for table in soup.find_all('table'):
            for row in table.find_all('tr'):
                cells = row.find_all(['td', 'th'])
                if len(cells) < 2:
                    continue

                # Extract club name from first cell.
                club_name = cells[0].get_text(strip=True)

                # Skip header rows and empty cells.
                if not club_name or club_name.lower() in ['club', 'name', 'abbreviation', 'team', 'clubs']:
                    continue

                # Extract abbreviation from the second cell.
                # BUG FIX: the original referenced `abbreviation` in the
                # header-row check below *before* assigning it, raising
                # NameError; extract it first.
                abbreviation = cells[1].get_text(strip=True)

                # Skip if it's clearly a header row.
                if club_name == 'Clubs' and abbreviation == 'Abbreviated Title':
                    continue

                # Extract teams if available (e.g. "A, B" or "A; B").
                teams = []
                if len(cells) > 2:
                    teams_text = cells[2].get_text(strip=True)
                    if teams_text:
                        teams = [team.strip() for team in re.split(r'[,;]', teams_text) if team.strip()]

                # Extract convenor and email if those columns exist.
                convenor = cells[3].get_text(strip=True) if len(cells) > 3 else None
                email = cells[4].get_text(strip=True) if len(cells) > 4 else None

                clubs.append({
                    'name': club_name,
                    'abbreviation': abbreviation,
                    'teams': teams,
                    'convenor': convenor,
                    'email': email
                })

        # If no structured data found, try to extract from text content.
        if not clubs:
            clubs = self._extract_clubs_from_text(html_content)

        return clubs

    def _extract_clubs_from_text(self, html_content):
        """Extract club names from text content if no structured data found.

        Scans the page's visible text for any known abbreviation or full
        name from CLUB_ABBREVIATIONS and emits a minimal club dict for
        each match (teams/convenor/email unknown).
        """
        soup = BeautifulSoup(html_content, 'lxml')
        clubs = []

        # Look for common patterns in text.
        text_content = soup.get_text()

        # Extract known club names from the text.
        for abbreviation, full_name in self.CLUB_ABBREVIATIONS.items():
            if abbreviation in text_content or full_name in text_content:
                clubs.append({
                    'name': full_name,
                    'abbreviation': abbreviation,
                    'teams': [],
                    'convenor': None,
                    'email': None
                })

        return clubs

    def get_clubs_with_abbreviations(self):
        """Get clubs with proper abbreviation handling.

        Normalises scraped clubs against CLUB_ABBREVIATIONS: a known
        abbreviation is expanded to its full name, a known full name gets
        its abbreviation filled in, and anything else passes through
        unchanged.  Input dicts are copied, never mutated.
        """
        clubs = self.fetch_clubs()
        processed_clubs = []

        for club in clubs:
            name = club['name']
            abbreviation = club.get('abbreviation', '')

            if abbreviation and abbreviation in self.CLUB_ABBREVIATIONS:
                # Known abbreviation: expand it to the canonical full name.
                processed_club = club.copy()
                processed_club['name'] = self.CLUB_ABBREVIATIONS[abbreviation]
                processed_club['abbreviation'] = abbreviation
                processed_clubs.append(processed_club)
            elif name in self.CLUB_ABBREVIATIONS.values():
                # Name is already a full name: back-fill its abbreviation.
                for abbr, full in self.CLUB_ABBREVIATIONS.items():
                    if full == name:
                        processed_club = club.copy()
                        processed_club['abbreviation'] = abbr
                        processed_clubs.append(processed_club)
                        break
            else:
                # Keep as-is if no mapping found.
                processed_clubs.append(club)

        return processed_clubs

    def get_club_logo_url(self, club_name):
        """Generate a logo URL for a club (placeholder implementation).

        This could be enhanced to fetch actual logos from the website;
        for now it derives a static path from a slugified club name.
        """
        club_slug = club_name.lower().replace(' ', '_').replace('.', '').replace(',', '')
        return f"/static/images/clubs/{club_slug}_logo.png"
|
|
|
|
|
|
def get_hk_hockey_clubs():
    """Convenience function to get Hong Kong hockey clubs."""
    return ClubScraper().get_clubs_with_abbreviations()
|
|
|
|
|
|
def expand_club_abbreviation(abbreviation):
    """Expand a club abbreviation to its full name.

    Unknown abbreviations are returned unchanged.
    """
    mapping = ClubScraper.CLUB_ABBREVIATIONS
    if abbreviation in mapping:
        return mapping[abbreviation]
    return abbreviation
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: fetch the live page and dump whatever was parsed.
    separator = "=" * 60
    print("Testing Hong Kong Hockey Club Scraper...")
    print(separator)

    scraper = ClubScraper()

    print("\nFetching clubs from Hockey Hong Kong website...")
    clubs = scraper.get_clubs_with_abbreviations()

    if not clubs:
        # Nothing scraped — dump the hard-coded fallback mapping instead.
        print("\nNo clubs found. This might be due to website structure changes.")
        print("Using fallback club list...")

        for abbreviation, full_name in scraper.CLUB_ABBREVIATIONS.items():
            print(f"- {full_name} ({abbreviation})")
    else:
        print(f"\nFound {len(clubs)} clubs:")
        for idx, club in enumerate(clubs, 1):
            print(f"\n{idx}. {club['name']}")
            if club.get('abbreviation'):
                print(f" Abbreviation: {club['abbreviation']}")
            if club.get('teams'):
                print(f" Teams: {', '.join(club['teams'])}")
            if club.get('convenor'):
                print(f" Convenor: {club['convenor']}")
            if club.get('email'):
                print(f" Email: {club['email']}")

    print("\n" + separator)
|