120 lines
4.7 KiB
Python
120 lines
4.7 KiB
Python
"""Tests for the cm_bot scraper resilience helpers.

The CM_BOT class currently uses bare `soup.find(...)['value']` calls
that throw cryptic TypeErrors when cm99.net returns an unexpected
response. R3 introduces three pieces:

- ScraperError: typed exception so callers can distinguish scraper
  failures from network errors.
- _dump_html(context, content): writes the failing response to
  logs/scraper-failures/<context>-<ts>.html and returns the path.
- _find_input_value(soup, ident, *, context, raw, by='name'):
  the dominant extraction pattern. Returns the value on success,
  dumps + raises ScraperError on miss.

These tests do NOT exercise the live cm99.net integration. They use
small inline HTML fixtures and confine filesystem side effects to a
per-test temporary directory so the tests stay hermetic.
"""
|
|
|
|
import os
|
|
import shutil
|
|
import tempfile
|
|
import unittest
|
|
from unittest import mock
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from app.cm_bot import CM_BOT, ScraperError
|
|
|
|
|
|
class ScraperHelpersTests(unittest.TestCase):
    """Offline tests for CM_BOT._dump_html and CM_BOT._find_input_value.

    Every test runs with CM_BOT_BASE_URL pointed at a placeholder host and
    with the working directory switched to a throwaway temp dir, so any
    dump files the helpers write land somewhere disposable.
    """

    def setUp(self):
        """Patch the environment, enter a fresh temp dir, and build the bot.

        All teardown is registered through addCleanup instead of a
        tearDown method: unittest does not call tearDown when setUp
        raises, so a failing CM_BOT() would otherwise leave the process
        inside a leaked temp dir. Cleanups run in LIFO order: restore the
        cwd, delete the temp dir, then undo the env patch.
        """
        # CM_BOT.__init__ reads CM_BOT_BASE_URL from the env. Patch it
        # in setUp (mock.patch.dict as a class decorator only wraps
        # test_* methods, so setUp would see an unpatched env).
        self._env_patcher = mock.patch.dict(
            os.environ, {"CM_BOT_BASE_URL": "https://example.invalid"}
        )
        self._env_patcher.start()
        self.addCleanup(self._env_patcher.stop)

        # Each test gets a fresh tmpdir so the dump helper writes
        # somewhere predictable. We chdir into it for the duration of
        # the test because _dump_html writes to a relative
        # logs/scraper-failures path.
        self._old_cwd = os.getcwd()
        self._tmp = tempfile.mkdtemp(prefix="r3-test-")
        self.addCleanup(shutil.rmtree, self._tmp, ignore_errors=True)
        os.chdir(self._tmp)
        # Registered after the rmtree cleanup so it runs before it: we
        # must leave the temp dir before it is removed.
        self.addCleanup(os.chdir, self._old_cwd)

        self.bot = CM_BOT()

    # ---- _dump_html ----

    def test_dump_html_creates_dir_and_writes_bytes(self):
        """Bytes content is written verbatim under logs/scraper-failures."""
        path = self.bot._dump_html("ctx-test", b"<html>hi</html>")
        self.assertTrue(os.path.isfile(path), f"file should exist: {path}")
        with open(path, "rb") as f:
            self.assertEqual(f.read(), b"<html>hi</html>")
        self.assertTrue(path.startswith(os.path.join("logs", "scraper-failures")))

    def test_dump_html_accepts_str_content(self):
        """str content ends up on disk as the equivalent bytes."""
        path = self.bot._dump_html("ctx-test", "<html>hi</html>")
        with open(path, "rb") as f:
            self.assertEqual(f.read(), b"<html>hi</html>")

    def test_dump_html_includes_context_and_timestamp_in_filename(self):
        """The dump filename is '<context>-<something>.html'."""
        path = self.bot._dump_html("register_form_token", b"x")
        basename = os.path.basename(path)
        prefix, suffix = "register_form_token-", ".html"
        self.assertTrue(basename.startswith(prefix), basename)
        self.assertTrue(basename.endswith(suffix), basename)
        # The timestamp format is not pinned here; only require that
        # something sits between the context prefix and the extension.
        self.assertGreater(len(basename), len(prefix) + len(suffix), basename)

    # ---- _find_input_value ----

    def test_find_input_value_returns_value_when_present(self):
        """A matching <input> yields its value attribute."""
        html = '<form><input name="token" value="abc123"></form>'
        soup = BeautifulSoup(html, "html.parser")
        result = self.bot._find_input_value(
            soup, "token", context="happy_path", raw=html.encode()
        )
        self.assertEqual(result, "abc123")

    def test_find_input_value_raises_and_dumps_when_missing(self):
        """A missing <input> raises ScraperError and leaves exactly one dump."""
        html = '<form><input name="other" value="x"></form>'
        soup = BeautifulSoup(html, "html.parser")
        with self.assertRaises(ScraperError) as cm:
            self.bot._find_input_value(
                soup, "token", context="missing_input", raw=html.encode()
            )
        # The error message must name both the call site context and the
        # identifier that was not found.
        msg = str(cm.exception)
        self.assertIn("missing_input", msg)
        self.assertIn("token", msg)
        dumped = os.listdir(os.path.join("logs", "scraper-failures"))
        self.assertEqual(len(dumped), 1, f"expected one dump, got {dumped}")
        self.assertTrue(dumped[0].startswith("missing_input-"))

    def test_find_input_value_raises_when_input_has_no_value_attr(self):
        """An <input> without a value attribute also raises ScraperError."""
        html = '<form><input name="token"></form>'
        soup = BeautifulSoup(html, "html.parser")
        with self.assertRaises(ScraperError):
            self.bot._find_input_value(
                soup, "token", context="no_value_attr", raw=html.encode()
            )

    def test_find_input_value_does_not_dump_on_success(self):
        """A successful lookup must not create the failure-dump directory."""
        html = '<form><input name="token" value="abc"></form>'
        soup = BeautifulSoup(html, "html.parser")
        self.bot._find_input_value(
            soup, "token", context="should_not_dump", raw=html.encode()
        )
        self.assertFalse(
            os.path.isdir(os.path.join("logs", "scraper-failures")),
            "happy path should not create the failure dir",
        )
|
|
|
|
|
|
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|