article.py 1.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546
  1. """Article class"""
  2. import unicodedata
  3. import re
  4. from bs4 import BeautifulSoup
  5. from . import MIDDLEWARE
  6. class Article(object):
  7. """Wrapper class for profile"""
  8. @staticmethod
  9. def info(article_id):
  10. """Get artcile"""
  11. path = 'news/show/{}'.format(article_id)
  12. response = MIDDLEWARE.get(path)
  13. soup = BeautifulSoup(response, 'html.parser')
  14. links = soup.select('.newspaper_links')
  15. newspaper = links[0]
  16. author = links[1]
  17. region = links[2]
  18. news_content = soup.select_one('.news_content')
  19. article_info = {
  20. 'article_id': article_id,
  21. 'article_title': unicodedata.normalize("NFKD", soup.select_one('.title_totr').text),
  22. 'newspaper_id': int(newspaper['action'].replace('newspaper/show/', '')),
  23. 'newspaper_name': newspaper.text,
  24. 'author_name': re.sub(r',\s\skarma.*$', '', author.text),
  25. 'author_id': int(author['action'].replace('slide/profile/', '')),
  26. 'region_name': region.text,
  27. 'region_id': int(region['action'].replace('map/details/', '')),
  28. 'content_text': news_content.text,
  29. 'content_html': news_content.prettify(),
  30. }
  31. result = re.search(r'.+(\s.+,)', soup.select_one('.tc.small').text)
  32. try:
  33. article_info['language'] = re.sub(r'\s|,', '', result[1].strip())
  34. except IndexError:
  35. pass
  36. return article_info