diff --git a/app/cm_bot.py b/app/cm_bot.py
index ee315a5..5a4f619 100644
--- a/app/cm_bot.py
+++ b/app/cm_bot.py
@@ -3,8 +3,15 @@ import requests, re
 from bs4 import BeautifulSoup
 import os
-# with open('security_response.html', 'wb') as f:
-#     f.write(response.content)
+
+class ScraperError(Exception):
+    """A cm99.net response did not contain the field we expected.
+
+    The raw response is saved to logs/scraper-failures/ before this is
+    raised; the message identifies which method failed and what was
+    being looked for.
+    """
+
 
 class CM_BOT:
     def __init__(self):
@@ -202,6 +209,45 @@ class CM_BOT:
             'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36'
         }
 
+    def _dump_html(self, context: str, content) -> str:
+        """Save a failing cm99.net response to logs/scraper-failures/.
+
+        The file is named <context>-<timestamp>.html; the timestamp has
+        microsecond resolution so back-to-back failures for the same
+        context are unlikely to overwrite each other.
+
+        Returns the path written to so callers can include it in error
+        messages.
+        """
+        ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S-%f")
+        out_dir = os.path.join("logs", "scraper-failures")
+        os.makedirs(out_dir, exist_ok=True)
+        path = os.path.join(out_dir, f"{context}-{ts}.html")
+        if isinstance(content, (bytes, bytearray)):
+            data = bytes(content)
+        else:
+            data = str(content).encode("utf-8", "replace")
+        with open(path, "wb") as f:
+            f.write(data)
+        print(f"[scraper-failure] dumped {context} response to {path}")
+        return path
+
+    def _find_input_value(self, soup, ident: str, *, context: str, raw, by: str = "name") -> str:
+        """Extract an <input> element's value or raise ScraperError.
+
+        `by` selects between matching <input name=...> (default) and
+        <input id=...>. Saves the raw response to logs/scraper-failures/
+        before raising so the operator can postmortem.
+        """
+        el = soup.find("input", {by: ident})
+        if el is None or "value" not in el.attrs:
+            path = self._dump_html(context, raw)
+            raise ScraperError(
+                f"{context}: input[{by}={ident!r}] missing or has no value attribute "
+                f"(response saved to {path})"
+            )
+        return el["value"]
+
     def get_register_data(self, token: str, username: str, password: str):
         return {
             'struts.token.name': 'token',
diff --git a/tests/test_cm_bot_scraper.py b/tests/test_cm_bot_scraper.py
new file mode 100644
index 0000000..c51519f
--- /dev/null
+++ b/tests/test_cm_bot_scraper.py
@@ -0,0 +1,122 @@
+"""Tests for the cm_bot scraper resilience helpers.
+
+The CM_BOT class currently uses bare `soup.find(...)['value']` calls
+that throw cryptic TypeErrors when cm99.net returns an unexpected
+response. R3 introduces three pieces:
+  - ScraperError: typed exception so callers can distinguish scraper
+    failures from network errors.
+  - _dump_html(context, content): writes the failing response to
+    logs/scraper-failures/<context>-<timestamp>.html and returns the path.
+  - _find_input_value(soup, ident, *, context, raw, by='name'):
+    the dominant extraction pattern. Returns the value on success,
+    dumps + raises ScraperError on miss.
+
+These tests do NOT exercise the live cm99.net integration. They use
+small inline HTML fixtures and patch filesystem side effects so the
+tests stay hermetic.
+"""
+
+import os
+import shutil
+import tempfile
+import unittest
+from unittest import mock
+
+from bs4 import BeautifulSoup
+
+from app.cm_bot import CM_BOT, ScraperError
+
+
+class ScraperHelpersTests(unittest.TestCase):
+    def setUp(self):
+        # CM_BOT.__init__ reads CM_BOT_BASE_URL from the env. Patch it
+        # in setUp (mock.patch.dict as a class decorator only wraps
+        # test_* methods, so setUp would see an unpatched env).
+        self._env_patcher = mock.patch.dict(
+            os.environ, {"CM_BOT_BASE_URL": "https://example.invalid"}
+        )
+        self._env_patcher.start()
+        self.addCleanup(self._env_patcher.stop)
+
+        # Each test gets a fresh tmpdir so the dump helper writes
+        # somewhere predictable. We chdir into it for the duration of
+        # the test because _dump_html writes to a relative
+        # logs/scraper-failures path.
+        #
+        # Restoration uses addCleanup rather than tearDown: tearDown is
+        # skipped when setUp itself raises (e.g. CM_BOT() failing), which
+        # would leave later tests running inside the tmpdir. Cleanups run
+        # LIFO, so we chdir back before the tmpdir is removed.
+        old_cwd = os.getcwd()
+        self._tmp = tempfile.mkdtemp(prefix="r3-test-")
+        self.addCleanup(shutil.rmtree, self._tmp, ignore_errors=True)
+        os.chdir(self._tmp)
+        self.addCleanup(os.chdir, old_cwd)
+        self.bot = CM_BOT()
+
+    # ---- _dump_html ----
+
+    def test_dump_html_creates_dir_and_writes_bytes(self):
+        path = self.bot._dump_html("ctx-test", b"hi")
+        self.assertTrue(os.path.isfile(path), f"file should exist: {path}")
+        with open(path, "rb") as f:
+            self.assertEqual(f.read(), b"hi")
+        self.assertTrue(path.startswith(os.path.join("logs", "scraper-failures")))
+
+    def test_dump_html_accepts_str_content(self):
+        path = self.bot._dump_html("ctx-test", "hi")
+        with open(path, "rb") as f:
+            self.assertEqual(f.read(), b"hi")
+
+    def test_dump_html_includes_context_and_timestamp_in_filename(self):
+        path = self.bot._dump_html("register_form_token", b"x")
+        basename = os.path.basename(path)
+        self.assertTrue(basename.startswith("register_form_token-"), basename)
+        self.assertTrue(basename.endswith(".html"), basename)
+
+    # ---- _find_input_value ----
+
+    def test_find_input_value_returns_value_when_present(self):
+        html = '<form><input name="token" value="abc123"></form>'
+        soup = BeautifulSoup(html, "html.parser")
+        result = self.bot._find_input_value(
+            soup, "token", context="happy_path", raw=html.encode()
+        )
+        self.assertEqual(result, "abc123")
+
+    def test_find_input_value_raises_and_dumps_when_missing(self):
+        html = '<form><input name="other" value="x"></form>'
+        soup = BeautifulSoup(html, "html.parser")
+        with self.assertRaises(ScraperError) as cm:
+            self.bot._find_input_value(
+                soup, "token", context="missing_input", raw=html.encode()
+            )
+        msg = str(cm.exception)
+        self.assertIn("missing_input", msg)
+        self.assertIn("token", msg)
+        dumped = os.listdir(os.path.join("logs", "scraper-failures"))
+        self.assertEqual(len(dumped), 1, f"expected one dump, got {dumped}")
+        self.assertTrue(dumped[0].startswith("missing_input-"))
+
+    def test_find_input_value_raises_when_input_has_no_value_attr(self):
+        html = '<form><input name="token"></form>'
+        soup = BeautifulSoup(html, "html.parser")
+        with self.assertRaises(ScraperError):
+            self.bot._find_input_value(
+                soup, "token", context="no_value_attr", raw=html.encode()
+            )
+
+    def test_find_input_value_does_not_dump_on_success(self):
+        html = '<form><input name="token" value="x"></form>'
+        soup = BeautifulSoup(html, "html.parser")
+        self.bot._find_input_value(
+            soup, "token", context="should_not_dump", raw=html.encode()
+        )
+        self.assertFalse(
+            os.path.isdir(os.path.join("logs", "scraper-failures")),
+            "happy path should not create the failure dir",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()