cm_bot_v2/tests/test_cm_bot_scraper.py

120 lines
4.7 KiB
Python

"""Tests for the cm_bot scraper resilience helpers.
The CM_BOT class currently uses bare `soup.find(...)['value']` calls
that throw cryptic TypeErrors when cm99.net returns an unexpected
response. R3 introduces three pieces:
- ScraperError: typed exception so callers can distinguish scraper
  failures from network errors.
- _dump_html(context, content): writes the failing response to
  logs/scraper-failures/<context>-<ts>.html and returns the path.
- _find_input_value(soup, ident, *, context, raw, by='name'):
  the dominant extraction pattern. Returns the value on success,
  dumps + raises ScraperError on miss.
These tests do NOT exercise the live cm99.net integration. They use
small inline HTML fixtures and patch filesystem side effects so the
tests stay hermetic.
"""
import os
import shutil
import tempfile
import unittest
from unittest import mock
from bs4 import BeautifulSoup
from app.cm_bot import CM_BOT, ScraperError
class ScraperHelpersTests(unittest.TestCase):
    """Hermetic tests for ``CM_BOT._dump_html`` and ``CM_BOT._find_input_value``.

    Each test runs with ``CM_BOT_BASE_URL`` patched into the environment
    and with the current working directory switched to a private temporary
    directory, so any failure dump is written to a throwaway location.
    """

    # Relative directory the tests expect failure dumps to be written to.
    _DUMP_DIR = os.path.join("logs", "scraper-failures")

    def setUp(self):
        """Patch the env, enter a fresh temp cwd, and build the bot.

        All teardown work is registered through ``addCleanup`` rather than
        a ``tearDown`` method: cleanups still run when ``setUp`` itself
        raises (for example if ``CM_BOT()`` fails), whereas ``tearDown``
        would be skipped and the process would be left inside the temp
        directory with the directory leaked.
        """
        # CM_BOT.__init__ reads CM_BOT_BASE_URL from the env. Patch it
        # in setUp (mock.patch.dict as a class decorator only wraps
        # test_* methods, so setUp would see an unpatched env).
        self._env_patcher = mock.patch.dict(
            os.environ, {"CM_BOT_BASE_URL": "https://example.invalid"}
        )
        self._env_patcher.start()
        self.addCleanup(self._env_patcher.stop)

        # Each test gets a fresh tmpdir so the dump helper writes
        # somewhere predictable. We chdir into it for the duration of
        # the test because _dump_html writes to a relative
        # logs/scraper-failures path.
        self._old_cwd = os.getcwd()
        self._tmp = tempfile.mkdtemp(prefix="r3-test-")
        # Cleanups run last-in-first-out, so register the rmtree before
        # the chdir-back: the cwd is restored first and the directory is
        # removed only once the process is no longer inside it.
        self.addCleanup(shutil.rmtree, self._tmp, ignore_errors=True)
        os.chdir(self._tmp)
        self.addCleanup(os.chdir, self._old_cwd)

        self.bot = CM_BOT()

    # ---- _dump_html ----

    def test_dump_html_creates_dir_and_writes_bytes(self):
        """Bytes content is written verbatim under the dump directory."""
        path = self.bot._dump_html("ctx-test", b"<html>hi</html>")
        self.assertTrue(os.path.isfile(path), f"file should exist: {path}")
        with open(path, "rb") as f:
            self.assertEqual(f.read(), b"<html>hi</html>")
        self.assertTrue(path.startswith(self._DUMP_DIR))

    def test_dump_html_accepts_str_content(self):
        """Str content ends up on disk as the matching bytes."""
        path = self.bot._dump_html("ctx-test", "<html>hi</html>")
        with open(path, "rb") as f:
            self.assertEqual(f.read(), b"<html>hi</html>")

    def test_dump_html_includes_context_and_timestamp_in_filename(self):
        """The dump filename is ``<context>-<something>.html``."""
        path = self.bot._dump_html("register_form_token", b"x")
        basename = os.path.basename(path)
        self.assertTrue(basename.startswith("register_form_token-"), basename)
        self.assertTrue(basename.endswith(".html"), basename)

    # ---- _find_input_value ----

    def test_find_input_value_returns_value_when_present(self):
        """A matching input with a value attribute yields that value."""
        html = '<form><input name="token" value="abc123"></form>'
        soup = BeautifulSoup(html, "html.parser")
        result = self.bot._find_input_value(
            soup, "token", context="happy_path", raw=html.encode()
        )
        self.assertEqual(result, "abc123")

    def test_find_input_value_raises_and_dumps_when_missing(self):
        """A missing input raises ScraperError and leaves exactly one dump."""
        html = '<form><input name="other" value="x"></form>'
        soup = BeautifulSoup(html, "html.parser")
        with self.assertRaises(ScraperError) as cm:
            self.bot._find_input_value(
                soup, "token", context="missing_input", raw=html.encode()
            )
        # The error message must name both the call site context and the
        # identifier that could not be found.
        msg = str(cm.exception)
        self.assertIn("missing_input", msg)
        self.assertIn("token", msg)
        dumped = os.listdir(self._DUMP_DIR)
        self.assertEqual(len(dumped), 1, f"expected one dump, got {dumped}")
        self.assertTrue(dumped[0].startswith("missing_input-"))

    def test_find_input_value_raises_when_input_has_no_value_attr(self):
        """An input that exists but has no value attribute is also a miss."""
        html = '<form><input name="token"></form>'
        soup = BeautifulSoup(html, "html.parser")
        with self.assertRaises(ScraperError):
            self.bot._find_input_value(
                soup, "token", context="no_value_attr", raw=html.encode()
            )

    def test_find_input_value_does_not_dump_on_success(self):
        """A successful lookup must not create the failure directory."""
        html = '<form><input name="token" value="abc"></form>'
        soup = BeautifulSoup(html, "html.parser")
        self.bot._find_input_value(
            soup, "token", context="should_not_dump", raw=html.encode()
        )
        self.assertFalse(
            os.path.isdir(self._DUMP_DIR),
            "happy path should not create the failure dir",
        )
# Run the suite when this file is executed as a script; when the module is
# imported (e.g. by a test runner doing discovery) nothing happens here.
if __name__ == "__main__":
    unittest.main()