html_index.py 910 B

123456789101112131415161718192021222324252627282930313233343536373839
  1. """
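Parse HTML pages.

Reads ``index.html``, collects the elements that carry the ``list_link``
class and prints, for each of them, its name, its level and the value taken
from the ``Hospital: ...`` title of the level element.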
  3. """
  4. from bs4 import BeautifulSoup
  5. REGIONS = {
  6. 'Northern Netherlands': None,
  7. 'Eastern Netherlands': None,
  8. 'Southern Netherlands': None,
  9. 'Western Netherlands': None,
  10. 'Amsterdam': None,
  11. 'Luxembourg': None,
  12. }
  13. def read():
  14. """Read data from HTML"""
  15. with open("index.html") as file:
  16. soup = BeautifulSoup(file, 'html.parser')
  17. return soup.find_all(class_='list_link')
  18. def parse_regios(regios_tree):
  19. """Parse raw data"""
  20. for regio_tree in regios_tree:
  21. name_tag = regio_tree.find(class_='list_name')
  22. name = name_tag.string
  23. index_tag = regio_tree.find(class_='list_level')
  24. index = index_tag.string
  25. buildings = index_tag['title'].replace('Hospital: ', '')
  26. print("%40s %2s %4s" % (name, index, buildings))
  27. if __name__ == '__main__':
  28. raw_data = read()
  29. data = parse_regios(raw_data)