diff --git a/app/cm_bot.py b/app/cm_bot.py
index ee315a5..5a4f619 100644
--- a/app/cm_bot.py
+++ b/app/cm_bot.py
@@ -3,8 +3,15 @@ import requests, re
from bs4 import BeautifulSoup
import os
-# with open('security_response.html', 'wb') as f:
-# f.write(response.content)
+
+class ScraperError(Exception):
+    """Raised when a cm99.net response lacks a field the scraper expected.
+
+    Before raising, the scraper saves the raw response under
+    logs/scraper-failures/; the exception message names the method that
+    failed and the field it was looking for.
+    """
+
class CM_BOT:
def __init__(self):
@@ -202,6 +209,41 @@ class CM_BOT:
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36'
}
+    def _dump_html(self, context: str, content) -> str:
+        """Save a failing cm99.net response to logs/scraper-failures/.
+
+        The file is named ``<context>-<timestamp>.html`` and is written
+        below the current working directory (the output path is relative).
+        The timestamp includes microseconds so that repeated failures of
+        the same context within one second do not overwrite each other.
+
+        Args:
+            context: Short label for the failing step; used as the
+                filename prefix.
+            content: Raw response body. ``bytes``/``bytearray`` are written
+                verbatim; anything else is converted with ``str()`` and
+                encoded as UTF-8, replacing unencodable characters.
+
+        Returns:
+            The path written to so callers can include it in error
+            messages.
+        """
+        # %f (microseconds) keeps back-to-back dumps of one context distinct.
+        ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S-%f")
+        out_dir = os.path.join("logs", "scraper-failures")
+        os.makedirs(out_dir, exist_ok=True)
+        path = os.path.join(out_dir, f"{context}-{ts}.html")
+        if isinstance(content, (bytes, bytearray)):
+            data = bytes(content)
+        else:
+            data = str(content).encode("utf-8", "replace")
+        with open(path, "wb") as f:
+            f.write(data)
+        print(f"[scraper-failure] dumped {context} response to {path}")
+        return path
+
+    def _find_input_value(self, soup, ident: str, *, context: str, raw, by: str = "name") -> str:
+        """Return the ``value`` attribute of a matching ``<input>`` element.
+
+        The element is located with ``soup.find("input", {by: ident})``, so
+        `by` names the attribute to match on: ``name`` by default, or
+        another attribute such as ``id``.
+
+        Args:
+            soup: Parsed document to search.
+            ident: Attribute value that identifies the wanted input.
+            context: Label for the calling step; it prefixes the error
+                message and the dump filename.
+            raw: Raw response body, saved to logs/scraper-failures/ when
+                the lookup fails so the operator can postmortem.
+            by: Attribute used for matching (keyword-only).
+
+        Raises:
+            ScraperError: No such input exists, or it has no ``value``
+                attribute. The raw response is dumped before raising.
+        """
+        match = soup.find("input", {by: ident})
+        if match is not None and "value" in match.attrs:
+            return match["value"]
+
+        # Failure path: keep the evidence first, then report where it went.
+        dump_path = self._dump_html(context, raw)
+        raise ScraperError(
+            f"{context}: input[{by}={ident!r}] missing or has no value attribute "
+            f"(response saved to {dump_path})"
+        )
+
def get_register_data(self, token: str, username: str, password: str):
return {
'struts.token.name': 'token',
diff --git a/tests/test_cm_bot_scraper.py b/tests/test_cm_bot_scraper.py
new file mode 100644
index 0000000..c51519f
--- /dev/null
+++ b/tests/test_cm_bot_scraper.py
@@ -0,0 +1,119 @@
+"""Tests for the cm_bot scraper resilience helpers.
+
+The CM_BOT class currently uses bare `soup.find(...)['value']` calls
+that throw cryptic TypeErrors when cm99.net returns an unexpected
+response. R3 introduces three pieces:
+ - ScraperError: typed exception so callers can distinguish scraper
+ failures from network errors.
+ - _dump_html(context, content): writes the failing response to
+ logs/scraper-failures/<context>-<timestamp>.html and returns the path.
+ - _find_input_value(soup, ident, *, context, raw, by='name'):
+ the dominant extraction pattern. Returns the value on success,
+ dumps + raises ScraperError on miss.
+
+These tests do NOT exercise the live cm99.net integration. They use
+small inline HTML fixtures and patch filesystem side effects so the
+tests stay hermetic.
+"""
+
+import os
+import shutil
+import tempfile
+import unittest
+from unittest import mock
+
+from bs4 import BeautifulSoup
+
+from app.cm_bot import CM_BOT, ScraperError
+
+
+class ScraperHelpersTests(unittest.TestCase):
+    """Hermetic tests for CM_BOT._dump_html and CM_BOT._find_input_value."""
+
+    def setUp(self):
+        # CM_BOT.__init__ reads CM_BOT_BASE_URL from the env. Patch it
+        # in setUp (mock.patch.dict as a class decorator only wraps
+        # test_* methods, so setUp would see an unpatched env).
+        self._env_patcher = mock.patch.dict(
+            os.environ, {"CM_BOT_BASE_URL": "https://example.invalid"}
+        )
+        self._env_patcher.start()
+        self.addCleanup(self._env_patcher.stop)
+
+        # _dump_html writes to a relative logs/scraper-failures path, so
+        # each test runs inside its own tmpdir. Cleanups are registered
+        # as each resource is acquired: unlike tearDown, they still run
+        # if a later setUp step (e.g. CM_BOT()) raises.
+        self._old_cwd = os.getcwd()
+        self._tmp = tempfile.mkdtemp(prefix="r3-test-")
+        self.addCleanup(shutil.rmtree, self._tmp, ignore_errors=True)
+        os.chdir(self._tmp)
+        self.addCleanup(os.chdir, self._old_cwd)
+        self.bot = CM_BOT()
+
+    # ---- _dump_html ----
+
+    def test_dump_html_creates_dir_and_writes_bytes(self):
+        path = self.bot._dump_html("ctx-test", b"hi")
+        self.assertTrue(os.path.isfile(path), f"file should exist: {path}")
+        with open(path, "rb") as f:
+            self.assertEqual(f.read(), b"hi")
+        self.assertTrue(path.startswith(os.path.join("logs", "scraper-failures")))
+
+    def test_dump_html_accepts_str_content(self):
+        path = self.bot._dump_html("ctx-test", "hi")
+        with open(path, "rb") as f:
+            self.assertEqual(f.read(), b"hi")
+
+    def test_dump_html_includes_context_and_timestamp_in_filename(self):
+        path = self.bot._dump_html("register_form_token", b"x")
+        basename = os.path.basename(path)
+        self.assertTrue(basename.startswith("register_form_token-"), basename)
+        self.assertTrue(basename.endswith(".html"), basename)
+
+    # ---- _find_input_value ----
+
+    def test_find_input_value_returns_value_when_present(self):
+        html = '<form><input name="token" value="abc123"></form>'
+        soup = BeautifulSoup(html, "html.parser")
+        result = self.bot._find_input_value(
+            soup, "token", context="happy_path", raw=html.encode()
+        )
+        self.assertEqual(result, "abc123")
+
+    def test_find_input_value_raises_and_dumps_when_missing(self):
+        html = '<html><body><p>no form here</p></body></html>'
+        soup = BeautifulSoup(html, "html.parser")
+        with self.assertRaises(ScraperError) as cm:
+            self.bot._find_input_value(
+                soup, "token", context="missing_input", raw=html.encode()
+            )
+        msg = str(cm.exception)
+        self.assertIn("missing_input", msg)
+        self.assertIn("token", msg)
+        dumped = os.listdir(os.path.join("logs", "scraper-failures"))
+        self.assertEqual(len(dumped), 1, f"expected one dump, got {dumped}")
+        self.assertTrue(dumped[0].startswith("missing_input-"))
+
+    def test_find_input_value_raises_when_input_has_no_value_attr(self):
+        html = '<form><input name="token"></form>'
+        soup = BeautifulSoup(html, "html.parser")
+        with self.assertRaises(ScraperError):
+            self.bot._find_input_value(
+                soup, "token", context="no_value_attr", raw=html.encode()
+            )
+
+    def test_find_input_value_does_not_dump_on_success(self):
+        html = '<form><input name="token" value="abc123"></form>'
+        soup = BeautifulSoup(html, "html.parser")
+        self.bot._find_input_value(
+            soup, "token", context="should_not_dump", raw=html.encode()
+        )
+        self.assertFalse(
+            os.path.isdir(os.path.join("logs", "scraper-failures")),
+            "happy path should not create the failure dir",
+        )
+
+
+if __name__ == "__main__":  # allow running this test file directly
+ unittest.main()