9 Commits

Author SHA1 Message Date
Paul Pfeister d25848cc5f chore: remote waf fingerprinting base 2025-10-04 23:54:29 -04:00
Paul Pfeister f32f4ffaee Merge pull request #2595 from obiwan04kanobi/feature/issue-2196-ci-docker-build-test
Add Docker build test to CI workflow (#2196)
2025-10-04 21:09:04 -04:00
Paul Pfeister 7379ba7b19 Merge branch 'remove-tor' 2025-10-04 20:52:40 -04:00
Paul Pfeister 3aeb6d6356 Merge pull request #2602 from sherlock-project/feat/no-txt
chore: make default --no-txt
2025-10-04 20:36:33 -04:00
Paul Pfeister e44fe49c8f Merge pull request #2601 from sherlock-project/feat/graceful-skip
feat: gracefully skip sites with invalid errorType
2025-10-04 20:23:07 -04:00
obiwan04kanobi b99719ce60 Add Docker build test to CI workflow
- Adds docker-build-test job to regression.yml
- Runs on push/merge to master and release branches
- Extracts VERSION_TAG from pyproject.toml for build
- Tests that Docker image builds and runs successfully
- Resolves dockerfile syntax warnings
- Resolves #2196"
2025-10-05 00:22:12 +05:30
Paul Pfeister 2016892e64 Remove torrequest dep
Not sure why it's not in my patch file, but I was removing via sed in my spec instead.
2024-06-28 23:39:38 -04:00
Paul Pfeister 44ad8f506a Lint 2024-06-28 23:38:44 -04:00
Siddharth Dushantha cfa4097df9 removed support for tor 2024-06-26 21:57:11 +02:00
7 changed files with 246 additions and 71 deletions
+25 -2
View File
@@ -11,6 +11,7 @@ on:
- '**/*.py'
- '**/*.ini'
- '**/*.toml'
- 'Dockerfile'
push:
branches:
- master
@@ -21,11 +22,13 @@ on:
- '**/*.py'
- '**/*.ini'
- '**/*.toml'
- 'Dockerfile'
jobs:
tox-lint:
# Linting is ran through tox to ensure that the same linter is used by local runners
runs-on: ubuntu-latest
# Linting is run through tox to ensure that the same linter
# is used by local runners
steps:
- uses: actions/checkout@v4
- name: Set up linting environment
@@ -41,7 +44,8 @@ jobs:
tox-matrix:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false # We want to know what specicic versions it fails on
# We want to know what specific versions it fails on
fail-fast: false
matrix:
os: [
ubuntu-latest,
@@ -67,3 +71,22 @@ jobs:
pip install tox-gh-actions
- name: Run tox
run: tox
docker-build-test:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Get version from pyproject.toml
id: get-version
run: |
VERSION=$(grep -m1 'version = ' pyproject.toml | cut -d'"' -f2)
echo "version=$VERSION" >> $GITHUB_OUTPUT
- name: Build Docker image
run: |
docker build \
--build-arg VERSION_TAG=${{ steps.get-version.outputs.version }} \
-t sherlock-test:latest .
- name: Test Docker image runs
run: docker run --rm sherlock-test:latest --version
+1 -1
View File
@@ -4,7 +4,7 @@
# 3. Build image with BOTH latest and version tags
# i.e. `docker build -t sherlock/sherlock:0.16.0 -t sherlock/sherlock:latest .`
FROM python:3.12-slim-bullseye as build
FROM python:3.12-slim-bullseye AS build
WORKDIR /sherlock
RUN pip3 install --no-cache-dir --upgrade pip
+1 -4
View File
@@ -46,13 +46,10 @@ PySocks = "^1.7.0"
requests = "^2.22.0"
requests-futures = "^1.0.0"
stem = "^1.8.0"
torrequest = "^0.1.0"
pandas = "^2.2.1"
openpyxl = "^3.0.10"
tomli = "^2.2.1"
[tool.poetry.extras]
tor = ["torrequest"]
pyyaml = "^6.0.3"
[tool.poetry.group.dev.dependencies]
jsonschema = "^4.0.0"
+2 -64
View File
@@ -171,8 +171,6 @@ def sherlock(
username: str,
site_data: dict[str, dict[str, str]],
query_notify: QueryNotify,
tor: bool = False,
unique_tor: bool = False,
dump_response: bool = False,
proxy: Optional[str] = None,
timeout: int = 60,
@@ -188,8 +186,6 @@ def sherlock(
query_notify -- Object with base type of QueryNotify().
This will be used to notify the caller about
query results.
tor -- Boolean indicating whether to use a tor circuit for the requests.
unique_tor -- Boolean indicating whether to use a new tor circuit for each request.
proxy -- String indicating the proxy URL
timeout -- Time in seconds to wait before timing out request.
Default is 60 seconds.
@@ -210,32 +206,9 @@ def sherlock(
# Notify caller that we are starting the query.
query_notify.start(username)
# Create session based on request methodology
if tor or unique_tor:
try:
from torrequest import TorRequest # noqa: E402
except ImportError:
print("Important!")
print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.")
print("> If you've installed Sherlock via pip, you can include the optional dependency via `pip install 'sherlock-project[tor]'`.")
print("> Other packages should refer to their documentation, or install it separately with `pip install torrequest`.\n")
sys.exit(query_notify.finish())
print("Important!")
print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.")
# Requests using Tor obfuscation
try:
underlying_request = TorRequest()
except OSError:
print("Tor not found in system path. Unable to continue.\n")
sys.exit(query_notify.finish())
underlying_session = underlying_request.session
else:
# Normal requests
underlying_session = requests.session()
underlying_request = requests.Request()
# Normal requests
underlying_session = requests.session()
# Limit number of workers to 20.
# This is probably vastly overkill.
@@ -359,15 +332,10 @@ def sherlock(
# Store future in data for access later
net_info["request_future"] = future
# Reset identify for tor (if needed)
if unique_tor:
underlying_request.reset_identity()
# Add this site's results into final dictionary with all the other results.
results_total[social_network] = results_site
# Open the file containing account links
# Core logic: If tor requests, make them here. If multi-threaded requests, wait for responses
for social_network, net_info in site_data.items():
# Retrieve results again
results_site = results_total.get(social_network)
@@ -600,22 +568,6 @@ def main():
dest="output",
help="If using single username, the output of the result will be saved to this file.",
)
parser.add_argument(
"--tor",
"-t",
action="store_true",
dest="tor",
default=False,
help="Make requests over Tor; increases runtime; requires Tor to be installed and in system path.",
)
parser.add_argument(
"--unique-tor",
"-u",
action="store_true",
dest="unique_tor",
default=False,
help="Make requests over Tor with new Tor circuit after each request; increases runtime; requires Tor to be installed and in system path.",
)
parser.add_argument(
"--csv",
action="store_true",
@@ -769,22 +721,10 @@ def main():
except Exception as error:
print(f"A problem occurred while checking for an update: {error}")
# Argument check
# TODO regex check on args.proxy
if args.tor and (args.proxy is not None):
raise Exception("Tor and Proxy cannot be set at the same time.")
# Make prompts
if args.proxy is not None:
print("Using the proxy: " + args.proxy)
if args.tor or args.unique_tor:
print("Using Tor to make requests")
print(
"Warning: some websites might refuse connecting over Tor, so note that using this option might increase connection errors."
)
if args.no_color:
# Disable color output.
init(strip=True, convert=False)
@@ -885,8 +825,6 @@ def main():
username,
site_data,
query_notify,
tor=args.tor,
unique_tor=args.unique_tor,
dump_response=args.dump_response,
proxy=args.proxy,
timeout=args.timeout,
+84
View File
@@ -0,0 +1,84 @@
import requests
import yaml
NUCLEI_FINGERPRINT_URL: str = "https://raw.githubusercontent.com/projectdiscovery/nuclei-templates/refs/heads/main/http/global-matchers/global-waf-detect.yaml"
def _check_nuclei_regex(matcher: dict[str,str|list[str]], response: requests.Response) -> bool:
    """Evaluate a single Nuclei ``regex`` matcher against an HTTP response.

    Keyword arguments:
    matcher -- One matcher entry from a Nuclei template. Reads the keys
               'regex' (list of patterns, required), 'part' ('body',
               'header', or anything else for body plus headers; treated
               as 'body' when absent, which is Nuclei's documented default)
               and 'condition' ('and' requires every pattern to match; any
               other value, or no value, requires at least one).
    response -- The HTTP response to inspect. Only ``.text`` and
                ``.headers`` are read.

    Returns True if the matcher is satisfied, False otherwise. An empty
    pattern list satisfies an 'and' matcher and never satisfies an 'or'
    matcher.
    """
    import re

    part = matcher.get('part', 'body')
    if part == 'body':
        target_text = response.text
    elif part == 'header':
        target_text = str(response.headers)
    else:
        target_text = response.text + str(response.headers)

    # Lazy generator: all()/any() stop at the first deciding pattern, so an
    # 'or' matcher keeps trying later patterns after an early miss instead
    # of giving up on the first pattern that fails.
    hits = (re.search(pattern, target_text) is not None for pattern in matcher['regex'])
    if matcher.get('condition', '') == 'and':
        return all(hits)
    return any(hits)
def _check_nuclei_words(matcher: dict[str,str|list[str]], response: requests.Response) -> bool:
    """Evaluate a single Nuclei ``word`` matcher against an HTTP response.

    Keyword arguments:
    matcher -- One matcher entry from a Nuclei template. Reads the keys
               'words' (list of substrings, required), 'part' ('body',
               'header', or anything else for body plus headers; treated
               as 'body' when absent, which is Nuclei's documented default)
               and 'condition' ('and' requires every word to be present;
               any other value, or no value, requires at least one).
    response -- The HTTP response to inspect. Only ``.text`` and
                ``.headers`` are read.

    Returns True if the matcher is satisfied, False otherwise. Word lookup
    is a plain case-sensitive substring test. An empty word list satisfies
    an 'and' matcher and never satisfies an 'or' matcher.
    """
    part = matcher.get('part', 'body')
    if part == 'body':
        target_text = response.text
    elif part == 'header':
        target_text = str(response.headers)
    else:
        target_text = response.text + str(response.headers)

    # Lazy generator: all()/any() stop at the first deciding word, so an
    # 'or' matcher keeps trying later words after an early miss instead of
    # giving up on the first word that is absent.
    hits = (word in target_text for word in matcher['words'])
    if matcher.get('condition', '') == 'and':
        return all(hits)
    return any(hits)
def fetch_nuclei_fingerprints() -> list[dict[str,str|list[str]]] | None:
    """Fetch the latest Nuclei WAF fingerprints from the official repository.

    Downloads the template at NUCLEI_FINGERPRINT_URL (10 second timeout),
    parses it with ``yaml.safe_load`` and returns the list stored under
    ``http[0].matchers``.

    Returns the list of matcher dictionaries, or None after printing a
    diagnostic when the download fails, the body is not valid YAML, or the
    parsed document does not have the expected layout.
    """
    try:
        response = requests.get(NUCLEI_FINGERPRINT_URL, timeout=10)
        response.raise_for_status()
        body = response.text
    except requests.RequestException as e:
        print(f"Error fetching Nuclei fingerprints: {e}")
        return None

    try:
        raw = yaml.safe_load(body)
    except yaml.YAMLError as e:
        print(f"Error parsing YAML data: {e}")
        return None

    # A valid-but-unexpected document (empty file, renamed keys, empty
    # `http` list) must lead to the same "no fingerprints" outcome as the
    # other failures rather than an uncaught exception in the caller.
    try:
        fingerprints: list[dict[str,str|list[str]]] = raw['http'][0]['matchers']
    except (KeyError, IndexError, TypeError) as e:
        print(f"Unexpected Nuclei template structure: {e!r}")
        return None

    if not isinstance(fingerprints, list):
        print("Unexpected Nuclei template structure: 'matchers' is not a list")
        return None

    return fingerprints
def nuclei_check(response: requests.Response, fingerprints: list[dict[str,str|list[str]]]) -> bool:
    """Check if the response matches any of the WAF fingerprints.

    Keyword arguments:
    response -- The HTTP response to check.
    fingerprints -- The list of Nuclei WAF fingerprints to check against.
                    None or an empty list is treated as "nothing to check".

    Returns True if a WAF is detected, False otherwise.

    Matchers whose 'type' is neither 'word' nor 'regex' (or is missing) are
    skipped.
    """
    for matcher in fingerprints or ():
        matcher_type = matcher.get('type')
        if matcher_type == 'word':
            detected = _check_nuclei_words(matcher, response)
        elif matcher_type == 'regex':
            detected = _check_nuclei_regex(matcher, response)
        else:
            continue

        # Only a positive hit ends the scan; a miss on one fingerprint must
        # fall through to the next one, otherwise only the first matcher in
        # the list would ever be evaluated.
        if detected:
            return True

    return False
+26
View File
@@ -0,0 +1,26 @@
id: global-waf-detect
http:
- global-matchers: true
matchers-condition: or
matchers:
- type: regex
name: regexSite
regex:
- '(?i)access.to.this.page.has.been.denied'
- '(?i)http(s)?://(www.)?anotheroneblocked.\w+.whywasiblocked'
condition: or
part: response
- type: word
name: wordSiteBody
part: body
words:
- "bad_text_in_body"
- type: word
name: wordSiteHead
part: header
condition: or
words:
- "text_in_head"
- "other_in_head"
+107
View File
@@ -0,0 +1,107 @@
import os
import unittest
from unittest.mock import patch, Mock
import requests
from requests.structures import CaseInsensitiveDict
import yaml
from sherlock_project import waf_check
TEMPLATE_BODY_PATH: str = os.path.join(os.path.dirname(__file__), 'mocks', 'global_waf_detect.yaml')
def side_effect(url, **kwargs) -> Mock:
    """Stand-in for ``requests.get`` that serves the local mock template.

    A request for ``waf_check.NUCLEI_FINGERPRINT_URL`` yields a Mock with
    ``status_code`` 200 and ``text`` set to the contents of the file at
    TEMPLATE_BODY_PATH. Any other URL raises RuntimeError.
    """
    # Guard first: anything other than the fingerprint URL is a test bug.
    if url != waf_check.NUCLEI_FINGERPRINT_URL:
        raise RuntimeError("Unexpected URL")

    with open(TEMPLATE_BODY_PATH, 'r', encoding='utf-8') as template_file:
        body_text: str = template_file.read()

    fake_response: Mock = Mock()
    fake_response.status_code = 200
    fake_response.text = body_text
    return fake_response
class TestWafCheck(unittest.TestCase):
    """Unit tests for the Nuclei-based WAF fingerprint helpers in ``waf_check``."""

    # ``requests.get`` is patched with ``side_effect`` so the fetch is served
    # from the local mock template file instead of the network.
    @patch('sherlock_project.waf_check.requests.get')
    def test_fetch_nuclei_fingerprints(self, mock_requests_get): # type: ignore
        mock_requests_get.side_effect = side_effect
        result = waf_check.fetch_nuclei_fingerprints()

        # Build the expected value by parsing the same fixture file directly.
        with open(TEMPLATE_BODY_PATH, 'r', encoding='utf-8') as file:
            template_body: str = file.read()
        expected: list[dict[str, str | list[str]]] = yaml.safe_load(template_body)['http'][0]['matchers']
        self.assertEqual(result, expected)

    # Exercises the regex matcher. The same ``matcher`` dict is mutated step
    # by step, so every assertion depends on the state left by the previous
    # statements (in particular, 'condition' stays 'and' once it is set).
    def test_nuclei_regex_check(self):
        mock_res: requests.Response = requests.Response()
        mock_res.status_code = 200
        mock_res._content = b"This is a test response with Test-Regex in the body."
        mock_res.headers = CaseInsensitiveDict({
            'Content-Type': 'text/html',
            'Server': 'TestServer'
        })
        matcher: dict[str, str | list[str]] = {
            'type': 'regex',
            'name': 'test-regex',
            'part': 'body',
            'regex': [r'(?i)not-present'],
            'condition': 'or'
        }
        # 'or' with a single pattern that is absent from the body
        self.assertFalse(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
        # 'or' with a single (case-insensitive) pattern that is present
        matcher['regex'] = [r'(?i)TeSt-REgEx']
        self.assertTrue(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
        # 'or' with one present and one absent pattern
        matcher['regex'] = [r'(?i)TeSt-REgEx', r'(?i)Not-Present']
        self.assertTrue(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
        # same patterns under 'and': the absent one makes the matcher fail
        matcher['condition'] = 'and'
        self.assertFalse(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
        # header part: the pattern is found in the Server header value
        matcher['part'] = 'header'
        matcher['regex'] = [r'(?i)testserver']
        self.assertTrue(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
        # 'response' part: body and headers are both searched
        matcher['part'] = 'response'
        self.assertTrue(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
        matcher['regex'] = [r'(?i)not-present']
        self.assertFalse(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]

    # Exercises the word matcher. As in the regex test, the ``matcher`` dict
    # is mutated in sequence, so statement order matters.
    def test_nuclei_words_check(self):
        mock_res: requests.Response = requests.Response()
        mock_res.status_code = 200
        mock_res._content = b"This is a test response with test-words in the body."
        mock_res.headers = CaseInsensitiveDict({
            'Content-Type': 'text/html',
            'Server': 'TestServer'
        })
        matcher: dict[str, str | list[str]] = {
            'type': 'word',
            'name': 'test-word',
            'part': 'body',
            'words': ['not-present'],
            'condition': 'or'
        }
        # 'or' with a single word that is absent from the body
        self.assertFalse(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
        # 'test-word' is a substring of 'test-words' in the body
        matcher['words'] = ['test-word']
        self.assertTrue(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
        # 'or' with one present and one absent word
        matcher['words'] = ['test-word', 'Not-Present']
        self.assertTrue(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
        # same words under 'and': the absent one makes the matcher fail
        matcher['condition'] = 'and'
        self.assertFalse(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
        # word matching is case-sensitive: 'testserver' != 'TestServer'
        matcher['part'] = 'header'
        matcher['words'] = ['testserver']
        self.assertFalse(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
        matcher['words'] = ['TestServer']
        self.assertTrue(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
        # 'response' part: body and headers are both searched
        matcher['part'] = 'response'
        self.assertTrue(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]