Эх сурвалжийг харах

Working on html parser for index

JoostSijm 6 жил өмнө
parent
commit
f117e4537b
1 өөрчлөгдсөн 39 нэмэгдсэн, 0 устгасан
  1. 39 0
      html_index.py

+ 39 - 0
html_index.py

@@ -0,0 +1,39 @@
+
+"""
+Parse the region list out of a saved HTML index page (index.html).
+"""
+
+from bs4 import BeautifulSoup
+
+
+# Known region names, each mapped to a None placeholder value.
+REGIONS = dict.fromkeys(
+    (
+        'Northern Netherlands',
+        'Eastern Netherlands',
+        'Southern Netherlands',
+        'Western Netherlands',
+        'Amsterdam',
+        'Luxembourg',
+    )
+)
+
+
+def read():
+    """Load ``index.html`` and return its ``list_link`` elements.
+
+    The file is opened relative to the current working directory and parsed
+    with BeautifulSoup's built-in ``html.parser`` backend.
+
+    Returns:
+        The result of ``find_all(class_='list_link')`` on the parsed document.
+    """
+    with open("index.html") as html_file:
+        # BeautifulSoup consumes the whole handle here, so the document is
+        # fully built before the file is closed.
+        document = BeautifulSoup(html_file, 'html.parser')
+    return document.find_all(class_='list_link')
+
+
+def parse_regios(regios_tree):
+    """Extract and print the name, index and buildings value of each region.
+
+    For every element in ``regios_tree`` the text of its ``list_name`` child
+    is used as the name and the text of its ``list_level`` child as the
+    index.  The buildings value is the ``title`` attribute of the
+    ``list_level`` child with the ``'Hospital: '`` label removed.  One
+    right-aligned line per region is printed to standard output.
+
+    Args:
+        regios_tree: Iterable of parsed elements, each supporting
+            ``find(class_=...)``; the children found must expose ``.string``
+            and, for ``list_level``, item access to ``'title'``.
+
+    Returns:
+        A list of ``(name, index, buildings)`` tuples in input order, so the
+        caller gets the parsed values back instead of ``None``.
+    """
+    regions = []
+    for regio_tree in regios_tree:
+        name = regio_tree.find(class_='list_name').string
+        index_tag = regio_tree.find(class_='list_level')
+        index = index_tag.string
+        # Presumably the title reads "Hospital: <value>"; keep only the value.
+        buildings = index_tag['title'].replace('Hospital: ', '')
+        print("%40s %2s %4s" % (name, index, buildings))
+        regions.append((name, index, buildings))
+    return regions
+
+
+if __name__ == '__main__':
+    # Script entry point: load the list entries from index.html and hand them
+    # straight to parse_regios, which prints one line per entry.
+    data = parse_regios(read())