__init__.py 1.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. import argparse
  2. import html.parser
  3. import re
  4. import requests
  5. _TEXMED_URL_PATTERN = 'https://www.bioinformatics.org/texmed/cgi-bin' \
  6. '/list.cgi?PMID={pmid}&linkOut'
  7. class _TeXMedHtmlParser(html.parser.HTMLParser):
  8. def __init__(self):
  9. self.bibtex_entry = None
  10. super().__init__()
  11. @staticmethod
  12. def _strip_bibtex_entry(data: str) -> str:
  13. return re.sub(r'\n\% \d+\s?\n', '', data).strip() + '\n'
  14. def handle_data(self, data: str) -> None:
  15. if 'Author' in data:
  16. self.bibtex_entry = self._strip_bibtex_entry(data)
  17. def bibtex_entry_from_pmid(pmid: str) -> str:
  18. assert pmid.isdigit(), pmid
  19. resp = requests.get(_TEXMED_URL_PATTERN.format(pmid=pmid))
  20. resp.raise_for_status()
  21. parser = _TeXMedHtmlParser()
  22. parser.feed(resp.text)
  23. return parser.bibtex_entry
  24. def main():
  25. argparser = argparse.ArgumentParser()
  26. argparser.add_argument('pmid')
  27. args = argparser.parse_args()
  28. print(bibtex_entry_from_pmid(pmid=args.pmid),
  29. end='')
  30. if __name__ == '__main__':
  31. main()