diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 0f2eadf2..b9af7fda 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,5 +1,5 @@ ### REPOSITORY -/.github/CODEOWNERS @sdushantha +/.github/CODEOWNERS @sdushantha @ppfeister /.github/FUNDING.yml @sdushantha /LICENSE @sdushantha diff --git a/.github/workflows/exclusions.yml b/.github/workflows/exclusions.yml new file mode 100644 index 00000000..6617ad67 --- /dev/null +++ b/.github/workflows/exclusions.yml @@ -0,0 +1,89 @@ +name: Exclusions Updater + +on: + schedule: + #- cron: '0 5 * * 0' # Runs at 05:00 every Sunday + - cron: '0 5 * * *' # Runs at 05:00 every day + workflow_dispatch: + +jobs: + update-exclusions: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v5 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: '3.13' + + - name: Install Poetry + uses: abatilo/actions-poetry@v4 + with: + poetry-version: 'latest' + + - name: Install dependencies + run: | + poetry install --no-interaction --with dev + + - name: Run false positive tests + run: | + $(poetry env activate) + pytest -q --tb no -m validate_targets_fp -n 20 | tee fp_test_results.txt + deactivate + + - name: Parse false positive detections by desired categories + run: | + grep -oP '(?<=test_false_pos\[)[^\]]+(?=\].*result was Claimed)' fp_test_results.txt \ + | sort -u > false_positive_exclusions.txt + grep -oP '(?<=test_false_pos\[)[^\]]+(?=\].*result was WAF)' fp_test_results.txt \ + | sort -u > waf_hits.txt + + - name: Detect if exclusions list changed + id: detect_changes + run: | + git fetch origin exclusions || true + + if git show origin/exclusions:false_positive_exclusions.txt >/dev/null 2>&1; then + # If the exclusions branch and file exist, compare + if git diff --quiet origin/exclusions -- false_positive_exclusions.txt; then + echo "exclusions_changed=false" >> "$GITHUB_OUTPUT" + else + echo "exclusions_changed=true" >> "$GITHUB_OUTPUT" + fi + else + # If the exclusions branch or file do not exist, treat as changed + echo "exclusions_changed=true" >> "$GITHUB_OUTPUT" + fi + + - name: Quantify and display results + run: | + FP_COUNT=$(wc -l < false_positive_exclusions.txt | xargs) + WAF_COUNT=$(wc -l < waf_hits.txt | xargs) + echo ">>> Found $FP_COUNT false positives and $WAF_COUNT WAF hits." + echo ">>> False positive exclusions:" && cat false_positive_exclusions.txt + echo ">>> WAF hits:" && cat waf_hits.txt + + - name: Commit and push exclusions list + if: steps.detect_changes.outputs.exclusions_changed == 'true' + run: | + git config user.name "Paul Pfeister (automation)" + git config user.email "code@pfeister.dev" + + mv false_positive_exclusions.txt false_positive_exclusions.txt.tmp + + git add -f false_positive_exclusions.txt.tmp # -f required to override .gitignore + git stash push -m "stash false positive exclusion list" -- false_positive_exclusions.txt.tmp + + git fetch origin exclusions || true # Allows creation of branch if deleted + git checkout -B exclusions origin/exclusions || (git checkout --orphan exclusions && git rm -rf .) + + git stash pop || true + + mv false_positive_exclusions.txt.tmp false_positive_exclusions.txt + + git rm -f false_positive_exclusions.txt.tmp || true + git add false_positive_exclusions.txt + git commit -m "auto: update exclusions list" || echo "No changes to commit" + git push origin exclusions diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 2e5ea941..5029b870 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -11,6 +11,7 @@ on: - '**/*.py' - '**/*.ini' - '**/*.toml' + - 'Dockerfile' push: branches: - master @@ -21,11 +22,13 @@ on: - '**/*.py' - '**/*.ini' - '**/*.toml' + - 'Dockerfile' jobs: tox-lint: - # Linting is ran through tox to ensure that the same linter is used by local runners runs-on: ubuntu-latest + # Linting is ran through tox to ensure that the same linter + # is used by local runners steps: - uses: actions/checkout@v4 - name: Set up linting environment @@ -41,7 +44,8 @@ jobs: tox-matrix: runs-on: ${{ matrix.os }} strategy: - fail-fast: false # We want to know what specicic versions it fails on + # We want to know what specicic versions it fails on + fail-fast: false matrix: os: [ ubuntu-latest, @@ -49,10 +53,10 @@ jobs: macos-latest, ] python-version: [ - '3.9', '3.10', '3.11', '3.12', + '3.13', ] steps: - uses: actions/checkout@v4 @@ -67,3 +71,22 @@ jobs: pip install tox-gh-actions - name: Run tox run: tox + docker-build-test: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Get version from pyproject.toml + id: get-version + run: | + VERSION=$(grep -m1 'version = ' pyproject.toml | cut -d'"' -f2) + echo "version=$VERSION" >> $GITHUB_OUTPUT + - name: Build Docker image + run: | + docker build \ + --build-arg VERSION_TAG=${{ steps.get-version.outputs.version }} \ + -t sherlock-test:latest . + - name: Test Docker image runs + run: docker run --rm sherlock-test:latest --version diff --git a/.github/workflows/validate_modified_targets.yml b/.github/workflows/validate_modified_targets.yml new file mode 100644 index 00000000..de024090 --- /dev/null +++ b/.github/workflows/validate_modified_targets.yml @@ -0,0 +1,100 @@ +name: Modified Target Validation + +on: + pull_request_target: + branches: + - master + paths: + - "sherlock_project/resources/data.json" + +jobs: + validate-modified-targets: + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + steps: + - name: Checkout repository + uses: actions/checkout@v5 + with: + ref: ${{ github.base_ref }} + fetch-depth: 1 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: '3.13' + + - name: Install Poetry + uses: abatilo/actions-poetry@v4 + with: + poetry-version: 'latest' + + - name: Install dependencies + run: | + poetry install --no-interaction --with dev + + - name: Drop in place updated manifest from base + run: | + cp sherlock_project/resources/data.json data.json.base + git fetch origin pull/${{ github.event.pull_request.number }}/head:pr --depth=1 + git show pr:sherlock_project/resources/data.json > sherlock_project/resources/data.json + cp sherlock_project/resources/data.json data.json.head + + - name: Discover modified targets + id: discover-modified + run: | + CHANGED=$( + python - <<'EOF' + import json + with open("data.json.base") as f: base = json.load(f) + with open("data.json.head") as f: head = json.load(f) + + changed = [] + for k, v in head.items(): + if k not in base or base[k] != v: + changed.append(k) + + print(",".join(sorted(changed))) + EOF + ) + + # Preserve changelist + echo -e ">>> Changed targets: \n$(echo $CHANGED | tr ',' '\n')" + echo "changed_targets=$CHANGED" >> "$GITHUB_OUTPUT" + + - name: Validate modified targets + if: steps.discover-modified.outputs.changed_targets != '' + continue-on-error: true + run: | + poetry run pytest -q --tb no -rA -m validate_targets -n 20 \ + --chunked-sites "${{ steps.discover-modified.outputs.changed_targets }}" \ + --junitxml=validation_results.xml + + - name: Prepare validation summary + if: steps.discover-modified.outputs.changed_targets != '' + id: prepare-summary + run: | + summary=$( + poetry run python devel/summarize_site_validation.py validation_results.xml || echo "Failed to generate summary of test results" + ) + echo "$summary" > validation_summary.md + + - name: Announce validation results + if: steps.discover-modified.outputs.changed_targets != '' + uses: actions/github-script@v8 + with: + script: | + const fs = require('fs'); + const body = fs.readFileSync('validation_summary.md', 'utf8'); + await github.rest.issues.createComment({ + issue_number: context.payload.pull_request.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: body, + }); + + - name: This step shows as ran when no modifications are found + if: steps.discover-modified.outputs.changed_targets == '' + run: | + echo "No modified targets found" diff --git a/Dockerfile b/Dockerfile index 2e13f679..ccdfbf23 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,9 +2,9 @@ # 1. Update the version tag in the Dockerfile to match the version in sherlock/__init__.py # 2. Update the VCS_REF tag to match the tagged version's FULL commit hash # 3. Build image with BOTH latest and version tags - # i.e. `docker build -t sherlock/sherlock:0.15.0 -t sherlock/sherlock:latest .` + # i.e. `docker build -t sherlock/sherlock:0.16.0 -t sherlock/sherlock:latest .` -FROM python:3.12-slim-bullseye as build +FROM python:3.12-slim-bullseye AS build WORKDIR /sherlock RUN pip3 install --no-cache-dir --upgrade pip diff --git a/devel/summarize_site_validation.py b/devel/summarize_site_validation.py new file mode 100644 index 00000000..89d39750 --- /dev/null +++ b/devel/summarize_site_validation.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python +# This module summarizes the results of site validation tests queued by +# workflow validate_modified_targets for presentation in Issue comments. + +from defusedxml import ElementTree as ET +import sys +from pathlib import Path + +def summarize_junit_xml(xml_path: Path) -> str: + tree = ET.parse(xml_path) + root = tree.getroot() + suite = root.find('testsuite') + + pass_message: str = ":heavy_check_mark:   Pass" + fail_message: str = ":x:   Fail" + + if suite is None: + raise ValueError("Invalid JUnit XML: No testsuite found") + + summary_lines: list[str] = [] + summary_lines.append("#### Automatic validation of changes\n") + summary_lines.append("| Target | F+ Check | F- Check |") + summary_lines.append("|---|---|---|") + + failures = int(suite.get('failures', 0)) + errors_detected: bool = False + + results: dict[str, dict[str, str]] = {} + + for testcase in suite.findall('testcase'): + test_name = testcase.get('name').split('[')[0] + site_name = testcase.get('name').split('[')[1].rstrip(']') + failure = testcase.find('failure') + error = testcase.find('error') + + if site_name not in results: + results[site_name] = {} + + if test_name == "test_false_neg": + results[site_name]['F- Check'] = pass_message if failure is None and error is None else fail_message + elif test_name == "test_false_pos": + results[site_name]['F+ Check'] = pass_message if failure is None and error is None else fail_message + + if error is not None: + errors_detected = True + + for result in results: + summary_lines.append(f"| {result} | {results[result].get('F+ Check', 'Error!')} | {results[result].get('F- Check', 'Error!')} |") + + if failures > 0: + summary_lines.append("\n___\n" + + "\nFailures were detected on at least one updated target. Commits containing accuracy failures" + + " will often not be merged (unless a rationale is provided, such as false negatives due to regional differences).") + + if errors_detected: + summary_lines.append("\n___\n" + + "\n**Errors were detected during validation. Please review the workflow logs.**") + + return "\n".join(summary_lines) + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: summarize_site_validation.py ") + sys.exit(1) + + xml_path: Path = Path(sys.argv[1]) + if not xml_path.is_file(): + print(f"Error: File '{xml_path}' does not exist.") + sys.exit(1) + + summary: str = summarize_junit_xml(xml_path) + print(summary) diff --git a/docs/README.md b/docs/README.md index afabfcf9..af901109 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,6 @@ -

+


- + sherlock
Hunt down social media accounts by username across 400+ social networks
@@ -15,8 +15,7 @@

- - +demo

@@ -115,14 +114,14 @@ $ echo '{"usernames":["user123"]}' | apify call -so netmilk/sherlock }] ``` -Read more about the [Sherlock Actor](../.actor/README.md), including how to use it programmaticaly via the Apify [API](https://apify.com/netmilk/sherlock/api?fpr=sherlock), [CLI](https://docs.apify.com/cli/?fpr=sherlock) and [JS/TS and Python SDKs](https://docs.apify.com/sdk?fpr=sherlock). +Read more about the [Sherlock Actor](../.actor/README.md), including how to use it programmatically via the Apify [API](https://apify.com/netmilk/sherlock/api?fpr=sherlock), [CLI](https://docs.apify.com/cli/?fpr=sherlock) and [JS/TS and Python SDKs](https://docs.apify.com/sdk?fpr=sherlock). ## Credits Thank you to everyone who has contributed to Sherlock! ❤️ - + contributors ## Star history diff --git a/docs/removed-sites.md b/docs/removed-sites.md index ecf8631e..b44e520e 100644 --- a/docs/removed-sites.md +++ b/docs/removed-sites.md @@ -1982,3 +1982,16 @@ __2025-02-16 :__ Unsure if any way to view profiles exists now "username_claimed": "t3dotgg" } ``` + +## TorrentGalaxy +__2025-07-06 :__ Site appears to have gone offline in March and hasn't come back +```json + "TorrentGalaxy": { + "errorMsg": "TGx:Can't show details", + "errorType": "message", + "regexCheck": "^[A-Za-z0-9]{3,15}$", + "url": "https://torrentgalaxy.to/profile/{}", + "urlMain": "https://torrentgalaxy.to/", + "username_claimed": "GalaxyRG" + }, +``` diff --git a/pyproject.toml b/pyproject.toml index 069cb9d3..45dc683d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,8 +8,7 @@ source = "init" [tool.poetry] name = "sherlock-project" -# single source of truth for version is __init__.py -version = "0" +version = "0.16.0" description = "Hunt down social media accounts by username across social networks" license = "MIT" authors = [ @@ -47,15 +46,19 @@ PySocks = "^1.7.0" requests = "^2.22.0" requests-futures = "^1.0.0" stem = "^1.8.0" -torrequest = "^0.1.0" pandas = "^2.2.1" openpyxl = "^3.0.10" - -[tool.poetry.extras] -tor = ["torrequest"] +tomli = "^2.2.1" [tool.poetry.group.dev.dependencies] jsonschema = "^4.0.0" +rstr = "^3.2.2" +pytest = "^8.4.2" +pytest-xdist = "^3.8.0" + + +[tool.poetry.group.ci.dependencies] +defusedxml = "^0.7.1" [tool.poetry.scripts] sherlock = 'sherlock_project.sherlock:main' diff --git a/pytest.ini b/pytest.ini index bc1df7de..ce1af84e 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,7 @@ [pytest] -addopts = --strict-markers +addopts = --strict-markers -m "not validate_targets" markers = online: mark tests are requiring internet access. + validate_targets: mark tests for sweeping manifest validation (sends many requests). + validate_targets_fp: validate_targets, false positive tests only. + validate_targets_fn: validate_targets, false negative tests only. diff --git a/sherlock_project/__init__.py b/sherlock_project/__init__.py index 52307cd7..ad6c9e30 100644 --- a/sherlock_project/__init__.py +++ b/sherlock_project/__init__.py @@ -5,11 +5,26 @@ networks. """ +from importlib.metadata import version as pkg_version, PackageNotFoundError +import pathlib +import tomli + + +def get_version() -> str: + """Fetch the version number of the installed package.""" + try: + return pkg_version("sherlock_project") + except PackageNotFoundError: + pyproject_path: pathlib.Path = pathlib.Path(__file__).resolve().parent.parent / "pyproject.toml" + with pyproject_path.open("rb") as f: + pyproject_data = tomli.load(f) + return pyproject_data["tool"]["poetry"]["version"] + # This variable is only used to check for ImportErrors induced by users running as script rather than as module or package import_error_test_var = None __shortname__ = "Sherlock" __longname__ = "Sherlock: Find Usernames Across Social Networks" -__version__ = "0.15.0" +__version__ = get_version() forge_api_latest_release = "https://api.github.com/repos/sherlock-project/sherlock/releases/latest" diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 3ee6c343..9d5318af 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -79,13 +79,13 @@ "username_claimed": "pink" }, "AllMyLinks": { - "errorMsg": "Not Found", - "errorType": "message", - "regexCheck": "^[a-z0-9][a-z0-9-]{2,32}$", - "url": "https://allmylinks.com/{}", - "urlMain": "https://allmylinks.com/", - "username_claimed": "blue" - }, + "errorMsg": "Page not found", + "errorType": "message", + "regexCheck": "^[a-z0-9][a-z0-9-]{2,32}$", + "url": "https://allmylinks.com/{}", + "urlMain": "https://allmylinks.com/", + "username_claimed": "blue" +}, "AniWorld": { "errorMsg": "Dieses Profil ist nicht verf\u00fcgbar", "errorType": "message", @@ -115,12 +115,20 @@ "username_claimed": "lio24d" }, "Apple Discussions": { - "errorMsg": "The page you tried was not found. You may have used an outdated link or may have typed the address (URL) incorrectly.", + "errorMsg": "Looking for something in Apple Support Communities?", "errorType": "message", "url": "https://discussions.apple.com/profile/{}", "urlMain": "https://discussions.apple.com", "username_claimed": "jason" }, + "Aparat": { + "errorType": "status_code", + "request_method": "GET", + "url": "https://www.aparat.com/{}/", + "urlMain": "https://www.aparat.com/", + "urlProbe": "https://www.aparat.com/api/fa/v1/user/user/information/username/{}", + "username_claimed": "jadi" + }, "Archive of Our Own": { "errorType": "status_code", "regexCheck": "^[^.]*?$", @@ -250,6 +258,12 @@ "urlMain": "https://www.blipfoto.com/", "username_claimed": "blue" }, + "Blitz Tactics": { + "errorType": "status_code", + "url": "https://blitztactics.com/{}", + "urlMain": "https://blitztactics.com/", + "username_claimed": "Lance5500" + }, "Blogger": { "errorType": "status_code", "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$", @@ -257,13 +271,12 @@ "urlMain": "https://www.blogger.com/", "username_claimed": "blue" }, - "BoardGameGeek": { - "errorType": "message", - "regexCheck": "^[a-zA-Z0-9_]*$", - "errorMsg": "User not found", - "url": "https://boardgamegeek.com/user/{}", - "urlMain": "https://boardgamegeek.com", - "username_claimed": "blue" + "Bluesky": { + "errorType": "status_code", + "url": "https://bsky.app/profile/{}.bsky.social", + "urlProbe": "https://public.api.bsky.app/xrpc/app.bsky.actor.getProfile?actor={}.bsky.social", + "urlMain": "https://bsky.app/", + "username_claimed": "mcuban" }, "BongaCams": { "errorType": "status_code", @@ -278,6 +291,14 @@ "urlMain": "https://www.bookcrossing.com/", "username_claimed": "blue" }, + "BoardGameGeek": { + "errorMsg": "\"isValid\":true", + "errorType": "message", + "url": "https://boardgamegeek.com/user/{}", + "urlMain": "https://boardgamegeek.com/", + "urlProbe": "https://api.geekdo.com/api/accounts/validate/username?username={}", + "username_claimed": "blue" + }, "BraveCommunity": { "errorType": "status_code", "url": "https://community.brave.com/u/{}/", @@ -357,6 +378,12 @@ "urlMain": "https://career.habr.com/", "username_claimed": "blue" }, + "CashApp": { + "errorType": "status_code", + "url": "https://cash.app/${}", + "urlMain": "https://cash.app", + "username_claimed": "hotdiggitydog" + }, "Championat": { "errorType": "status_code", "url": "https://www.championat.com/user/{}", @@ -479,7 +506,8 @@ "username_claimed": "hacker" }, "Code Sandbox": { - "errorType": "status_code", + "errorType": "message", + "errorMsg": "Whoops, page not found", "url": "https://codesandbox.io/u/{}", "urlMain": "https://codesandbox.io", "username_claimed": "icyjoseph" @@ -551,8 +579,7 @@ "username_claimed": "brown" }, "CyberDefenders": { - "errorMsg": "Blue Team Training for SOC analysts and DFIR - CyberDefenders", - "errorType": "message", + "errorType": "status_code", "regexCheck": "^[^\\/:*?\"<>|@]{3,50}$", "request_method": "GET", "url": "https://cyberdefenders.org/p/{}", @@ -579,6 +606,12 @@ "urlMain": "https://www.dailymotion.com/", "username_claimed": "blue" }, + "dcinside": { + "errorType": "status_code", + "url": "https://gallog.dcinside.com/{}", + "urlMain": "https://www.dcinside.com/", + "username_claimed": "anrbrb" + }, "Dealabs": { "errorMsg": "La page que vous essayez", "errorType": "message", @@ -587,20 +620,21 @@ "urlMain": "https://www.dealabs.com/", "username_claimed": "blue" }, - "DeviantART": { - "errorType": "status_code", - "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$", - "url": "https://{}.deviantart.com", - "urlMain": "https://deviantart.com", - "username_claimed": "blue" - }, + "DeviantArt": { + "errorType": "message", + "errorMsg": "Llama Not Found", + "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$", + "url": "https://www.deviantart.com/{}", + "urlMain": "https://www.deviantart.com/", + "username_claimed": "blue" +}, "DigitalSpy": { - "errorMsg": "The page you were looking for could not be found.", - "errorType": "message", - "url": "https://forums.digitalspy.com/profile/{}", - "urlMain": "https://forums.digitalspy.com/", - "username_claimed": "blue", - "regexCheck": "^\\w{3,20}$" + "errorMsg": "The page you were looking for could not be found.", + "errorType": "message", + "url": "https://forums.digitalspy.com/profile/{}", + "urlMain": "https://forums.digitalspy.com/", + "username_claimed": "blue", + "regexCheck": "^\\w{3,20}$" }, "Discogs": { "errorType": "status_code", @@ -786,13 +820,12 @@ "urlMain": "https://fosstodon.org/", "username_claimed": "blue" }, - "Freelance.habr": { - "errorMsg": "
", - "errorType": "message", - "regexCheck": "^((?!\\.).)*$", - "url": "https://freelance.habr.com/freelancers/{}", - "urlMain": "https://freelance.habr.com/", - "username_claimed": "adam" + "Framapiaf": { + "errorType": "status_code", + "regexCheck": "^[a-zA-Z0-9_]{1,30}$", + "url": "https://framapiaf.org/@{}", + "urlMain": "https://framapiaf.org", + "username_claimed": "pylapp" }, "Freelancer": { "errorMsg": "\"users\":{}", @@ -1129,6 +1162,13 @@ "urlProbe": "https://imginn.com/{}", "username_claimed": "instagram" }, + "Instapaper": { + "errorType": "status_code", + "request_method": "GET", + "url": "https://www.instapaper.com/p/{}", + "urlMain": "https://www.instapaper.com/", + "username_claimed": "john" + }, "Instructables": { "errorType": "status_code", "url": "https://www.instructables.com/member/{}", @@ -1241,6 +1281,13 @@ "urlMain": "https://linux.org.ru/", "username_claimed": "red" }, + "Laracast": { + "errorType":"status_code", + "url": "https://laracasts.com/@{}", + "urlMain": "https://laracasts.com/", + "regexCheck": "^[a-zA-Z0-9_-]{3,}$", + "username_claimed": "user1" + }, "Launchpad": { "errorType": "status_code", "url": "https://launchpad.net/~{}", @@ -1298,6 +1345,12 @@ "urlMain": "https://linktr.ee/", "username_claimed": "anne" }, + "LinuxFR.org": { + "errorType": "status_code", + "url": "https://linuxfr.org/users/{}", + "urlMain": "https://linuxfr.org/", + "username_claimed": "pylapp" + }, "Listed": { "errorType": "response_url", "errorUrl": "https://listed.to/@{}", @@ -1338,6 +1391,13 @@ "urlMain": "https://forums.mmorpg.com/", "username_claimed": "goku" }, + "Mamot": { + "errorType": "status_code", + "regexCheck": "^[a-zA-Z0-9_]{1,30}$", + "url": "https://mamot.fr/@{}", + "urlMain": "https://mamot.fr/", + "username_claimed": "anciensEnssat" + }, "Medium": { "errorMsg": "Username available", + "errorMsg": "Page no longer exists", "url": "https://slideshare.net/{}", "urlMain": "https://slideshare.net/", "username_claimed": "blue" @@ -1865,6 +1978,13 @@ "urlMain": "https://www.snapchat.com", "username_claimed": "teamsnapchat" }, + "SOOP": { + "errorType": "status_code", + "url": "https://www.sooplive.co.kr/station/{}", + "urlMain": "https://www.sooplive.co.kr/", + "urlProbe": "https://api-channel.sooplive.co.kr/v1.1/channel/{}/station", + "username_claimed": "udkn" + }, "SoundCloud": { "errorType": "status_code", "url": "https://soundcloud.com/{}", @@ -1884,6 +2004,12 @@ "urlMain": "https://soylentnews.org", "username_claimed": "adam" }, + "SpeakerDeck": { + "errorType": "status_code", + "url": "https://speakerdeck.com/{}", + "urlMain": "https://speakerdeck.com/", + "username_claimed": "pylapp" + }, "Speedrun.com": { "errorType": "status_code", "url": "https://speedrun.com/users/{}", @@ -2025,6 +2151,12 @@ "urlMain": "https://themeforest.net/", "username_claimed": "user" }, + "tistory": { + "errorType": "status_code", + "url": "https://{}.tistory.com/", + "urlMain": "https://www.tistory.com/", + "username_claimed": "notice" + }, "TnAFlix": { "errorType": "status_code", "isNSFW": true, @@ -2032,14 +2164,6 @@ "urlMain": "https://www.tnaflix.com/", "username_claimed": "hacker" }, - "TorrentGalaxy": { - "errorMsg": "TGx:Can't show details", - "errorType": "message", - "regexCheck": "^[A-Za-z0-9]{3,15}$", - "url": "https://torrentgalaxy.to/profile/{}", - "urlMain": "https://torrentgalaxy.to/", - "username_claimed": "GalaxyRG" - }, "TradingView": { "errorType": "status_code", "request_method": "GET", @@ -2706,7 +2830,7 @@ "username_claimed": "green" }, "threads": { - "errorMsg": "Threads", + "errorMsg": "Threads • Log in", "errorType": "message", "headers": { "Sec-Fetch-Mode": "navigate" @@ -2721,12 +2845,24 @@ "urlMain": "https://www.toster.ru/", "username_claimed": "adam" }, + "tumblr": { + "errorType": "status_code", + "url": "https://{}.tumblr.com/", + "urlMain": "https://www.tumblr.com/", + "username_claimed": "goku" +}, "uid": { "errorType": "status_code", "url": "http://uid.me/{}", "urlMain": "https://uid.me/", "username_claimed": "blue" }, + "write.as": { + "errorType": "status_code", + "url": "https://write.as/{}", + "urlMain": "https://write.as", + "username_claimed": "pylapp" + }, "xHamster": { "errorType": "status_code", "isNSFW": true, @@ -2747,5 +2883,13 @@ "urlProbe": "https://public.api.bsky.app/xrpc/app.bsky.actor.getProfile?actor={}.bsky.social", "urlMain": "https://bsky.app/", "username_claimed": "mcuban" + }, + "Platzi": { + "errorType": "status_code", + "errorCode": 404, + "url": "https://platzi.com/p/{}/", + "urlMain": "https://platzi.com/", + "username_claimed": "freddier", + "request_method": "GET" } } \ No newline at end of file diff --git a/sherlock_project/resources/data.schema.json b/sherlock_project/resources/data.schema.json index 216ffb62..c717cb25 100644 --- a/sherlock_project/resources/data.schema.json +++ b/sherlock_project/resources/data.schema.json @@ -1,80 +1,149 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "Sherlock Target Manifest", - "description": "Social media targets to probe for the existence of known usernames", - "type": "object", - "properties": { - "$schema": { "type": "string" } - }, - "patternProperties": { - "^(?!\\$).*?$": { - "type": "object", - "description": "Target name and associated information (key should be human readable name)", - "required": [ "url", "urlMain", "errorType", "username_claimed" ], - "properties": { - "url": { "type": "string" }, - "urlMain": { "type": "string" }, - "urlProbe": { "type": "string" }, - "username_claimed": { "type": "string" }, - "regexCheck": { "type": "string" }, - "isNSFW": { "type": "boolean" }, - "headers": { "type": "object" }, - "request_payload": { "type": "object" }, - "__comment__": { - "type": "string", - "description": "Used to clarify important target information if (and only if) a commit message would not suffice.\nThis key should not be parsed anywhere within Sherlock." - }, - "tags": { - "oneOf": [ - { "$ref": "#/$defs/tag" }, - { "type": "array", "items": { "$ref": "#/$defs/tag" } } - ] - }, - "request_method": { - "type": "string", - "enum": [ "GET", "POST", "HEAD", "PUT" ] - }, + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Sherlock Target Manifest", + "description": "Social media targets to probe for the existence of known usernames", + "type": "object", + "properties": { + "$schema": { "type": "string" } + }, + "patternProperties": { + "^(?!\\$).*?$": { + "type": "object", + "description": "Target name and associated information (key should be human readable name)", + "required": ["url", "urlMain", "errorType", "username_claimed"], + "properties": { + "url": { "type": "string" }, + "urlMain": { "type": "string" }, + "urlProbe": { "type": "string" }, + "username_claimed": { "type": "string" }, + "regexCheck": { "type": "string" }, + "isNSFW": { "type": "boolean" }, + "headers": { "type": "object" }, + "request_payload": { "type": "object" }, + "__comment__": { + "type": "string", + "description": "Used to clarify important target information if (and only if) a commit message would not suffice.\nThis key should not be parsed anywhere within Sherlock." + }, + "tags": { + "oneOf": [ + { "$ref": "#/$defs/tag" }, + { "type": "array", "items": { "$ref": "#/$defs/tag" } } + ] + }, + "request_method": { + "type": "string", + "enum": ["GET", "POST", "HEAD", "PUT"] + }, + "errorType": { + "oneOf": [ + { + "type": "string", + "enum": ["message", "response_url", "status_code"] + }, + { + "type": "array", + "items": { + "type": "string", + "enum": ["message", "response_url", "status_code"] + } + } + ] + }, + "errorMsg": { + "oneOf": [ + { "type": "string" }, + { "type": "array", "items": { "type": "string" } } + ] + }, + "errorCode": { + "oneOf": [ + { "type": "integer" }, + { "type": "array", "items": { "type": "integer" } } + ] + }, + "errorUrl": { "type": "string" }, + "response_url": { "type": "string" } + }, + "dependencies": { + "errorMsg": { + "oneOf": [ + { "properties": { "errorType": { "const": "message" } } }, + { + "properties": { "errorType": { - "type": "string", - "enum": [ "message", "response_url", "status_code" ] - }, - "errorMsg": { - "oneOf": [ - { "type": "string" }, - { "type": "array", "items": { "type": "string" } } - ] - }, - "errorCode": { - "oneOf": [ - { "type": "integer" }, - { "type": "array", "items": { "type": "integer" } } - ] - }, - "errorUrl": { "type": "string" }, - "response_url": { "type": "string" } - }, - "dependencies": { - "errorMsg": { - "properties" : { "errorType": { "const": "message" } } - }, - "errorUrl": { - "properties": { "errorType": { "const": "response_url" } } - }, - "errorCode": { - "properties": { "errorType": { "const": "status_code" } } + "type": "array", + "contains": { "const": "message" } } - }, - "if": { "properties": { "errorType": { "const": "message" } } }, - "then": { "required": [ "errorMsg" ] }, - "else": { - "if": { "properties": { "errorType": { "const": "response_url" } } }, - "then": { "required": [ "errorUrl" ] } - }, - "additionalProperties": false + } + } + ] + }, + "errorUrl": { + "oneOf": [ + { "properties": { "errorType": { "const": "response_url" } } }, + { + "properties": { + "errorType": { + "type": "array", + "contains": { "const": "response_url" } + } + } + } + ] + }, + "errorCode": { + "oneOf": [ + { "properties": { "errorType": { "const": "status_code" } } }, + { + "properties": { + "errorType": { + "type": "array", + "contains": { "const": "status_code" } + } + } + } + ] } - }, - "additionalProperties": false, - "$defs": { - "tag": { "type": "string", "enum": [ "adult", "gaming" ] } + }, + "allOf": [ + { + "if": { + "anyOf": [ + { "properties": { "errorType": { "const": "message" } } }, + { + "properties": { + "errorType": { + "type": "array", + "contains": { "const": "message" } + } + } + } + ] + }, + "then": { "required": ["errorMsg"] } + }, + { + "if": { + "anyOf": [ + { "properties": { "errorType": { "const": "response_url" } } }, + { + "properties": { + "errorType": { + "type": "array", + "contains": { "const": "response_url" } + } + } + } + ] + }, + "then": { "required": ["errorUrl"] } + } + ], + "additionalProperties": false } + }, + "additionalProperties": false, + "$defs": { + "tag": { "type": "string", "enum": ["adult", "gaming"] } + } } diff --git a/sherlock_project/sherlock.py b/sherlock_project/sherlock.py index 4e80d31c..75b3e3d7 100644 --- a/sherlock_project/sherlock.py +++ b/sherlock_project/sherlock.py @@ -169,14 +169,12 @@ def multiple_usernames(username): def sherlock( username: str, - site_data: dict, + site_data: dict[str, dict[str, str]], query_notify: QueryNotify, - tor: bool = False, - unique_tor: bool = False, dump_response: bool = False, proxy: Optional[str] = None, timeout: int = 60, -): +) -> dict[str, dict[str, str | QueryResult]]: """Run Sherlock Analysis. Checks for existence of username on various social media sites. @@ -188,8 +186,6 @@ def sherlock( query_notify -- Object with base type of QueryNotify(). This will be used to notify the caller about query results. - tor -- Boolean indicating whether to use a tor circuit for the requests. - unique_tor -- Boolean indicating whether to use a new tor circuit for each request. proxy -- String indicating the proxy URL timeout -- Time in seconds to wait before timing out request. Default is 60 seconds. @@ -210,32 +206,9 @@ def sherlock( # Notify caller that we are starting the query. query_notify.start(username) - # Create session based on request methodology - if tor or unique_tor: - try: - from torrequest import TorRequest # noqa: E402 - except ImportError: - print("Important!") - print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.") - print("> If you've installed Sherlock via pip, you can include the optional dependency via `pip install 'sherlock-project[tor]'`.") - print("> Other packages should refer to their documentation, or install it separately with `pip install torrequest`.\n") - sys.exit(query_notify.finish()) - print("Important!") - print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.") - - # Requests using Tor obfuscation - try: - underlying_request = TorRequest() - except OSError: - print("Tor not found in system path. Unable to continue.\n") - sys.exit(query_notify.finish()) - - underlying_session = underlying_request.session - else: - # Normal requests - underlying_session = requests.session() - underlying_request = requests.Request() + # Normal requests + underlying_session = requests.session() # Limit number of workers to 20. # This is probably vastly overkill. @@ -359,15 +332,10 @@ def sherlock( # Store future in data for access later net_info["request_future"] = future - # Reset identify for tor (if needed) - if unique_tor: - underlying_request.reset_identity() - # Add this site's results into final dictionary with all the other results. results_total[social_network] = results_site # Open the file containing account links - # Core logic: If tor requests, make them here. If multi-threaded requests, wait for responses for social_network, net_info in site_data.items(): # Retrieve results again results_site = results_total.get(social_network) @@ -381,6 +349,8 @@ def sherlock( # Get the expected error type error_type = net_info["errorType"] + if isinstance(error_type, str): + error_type: list[str] = [error_type] # Retrieve future and ensure it has finished future = net_info["request_future"] @@ -425,58 +395,60 @@ def sherlock( elif any(hitMsg in r.text for hitMsg in WAFHitMsgs): query_status = QueryStatus.WAF - elif error_type == "message": - # error_flag True denotes no error found in the HTML - # error_flag False denotes error found in the HTML - error_flag = True - errors = net_info.get("errorMsg") - # errors will hold the error message - # it can be string or list - # by isinstance method we can detect that - # and handle the case for strings as normal procedure - # and if its list we can iterate the errors - if isinstance(errors, str): - # Checks if the error message is in the HTML - # if error is present we will set flag to False - if errors in r.text: - error_flag = False - else: - # If it's list, it will iterate all the error message - for error in errors: - if error in r.text: - error_flag = False - break - if error_flag: - query_status = QueryStatus.CLAIMED - else: - query_status = QueryStatus.AVAILABLE - elif error_type == "status_code": - error_codes = net_info.get("errorCode") - query_status = QueryStatus.CLAIMED - - # Type consistency, allowing for both singlets and lists in manifest - if isinstance(error_codes, int): - error_codes = [error_codes] - - if error_codes is not None and r.status_code in error_codes: - query_status = QueryStatus.AVAILABLE - elif r.status_code >= 300 or r.status_code < 200: - query_status = QueryStatus.AVAILABLE - elif error_type == "response_url": - # For this detection method, we have turned off the redirect. - # So, there is no need to check the response URL: it will always - # match the request. Instead, we will ensure that the response - # code indicates that the request was successful (i.e. no 404, or - # forward to some odd redirect). - if 200 <= r.status_code < 300: - query_status = QueryStatus.CLAIMED - else: - query_status = QueryStatus.AVAILABLE else: - # It should be impossible to ever get here... - raise ValueError( - f"Unknown Error Type '{error_type}' for " f"site '{social_network}'" - ) + if any(errtype not in ["message", "status_code", "response_url"] for errtype in error_type): + error_context = f"Unknown error type '{error_type}' for {social_network}" + query_status = QueryStatus.UNKNOWN + else: + if "message" in error_type: + # error_flag True denotes no error found in the HTML + # error_flag False denotes error found in the HTML + error_flag = True + errors = net_info.get("errorMsg") + # errors will hold the error message + # it can be string or list + # by isinstance method we can detect that + # and handle the case for strings as normal procedure + # and if its list we can iterate the errors + if isinstance(errors, str): + # Checks if the error message is in the HTML + # if error is present we will set flag to False + if errors in r.text: + error_flag = False + else: + # If it's list, it will iterate all the error message + for error in errors: + if error in r.text: + error_flag = False + break + if error_flag: + query_status = QueryStatus.CLAIMED + else: + query_status = QueryStatus.AVAILABLE + + if "status_code" in error_type and query_status is not QueryStatus.AVAILABLE: + error_codes = net_info.get("errorCode") + query_status = QueryStatus.CLAIMED + + # Type consistency, allowing for both singlets and lists in manifest + if isinstance(error_codes, int): + error_codes = [error_codes] + + if error_codes is not None and r.status_code in error_codes: + query_status = QueryStatus.AVAILABLE + elif r.status_code >= 300 or r.status_code < 200: + query_status = QueryStatus.AVAILABLE + + if "response_url" in error_type and query_status is not QueryStatus.AVAILABLE: + # For this detection method, we have turned off the redirect. + # So, there is no need to check the response URL: it will always + # match the request. Instead, we will ensure that the response + # code indicates that the request was successful (i.e. no 404, or + # forward to some odd redirect). + if 200 <= r.status_code < 300: + query_status = QueryStatus.CLAIMED + else: + query_status = QueryStatus.AVAILABLE if dump_response: print("+++++++++++++++++++++") @@ -507,7 +479,7 @@ def sherlock( print("+++++++++++++++++++++") # Notify caller about results of query. - result = QueryResult( + result: QueryResult = QueryResult( username=username, site_name=social_network, site_url_user=url, @@ -596,22 +568,6 @@ def main(): dest="output", help="If using single username, the output of the result will be saved to this file.", ) - parser.add_argument( - "--tor", - "-t", - action="store_true", - dest="tor", - default=False, - help="Make requests over Tor; increases runtime; requires Tor to be installed and in system path.", - ) - parser.add_argument( - "--unique-tor", - "-u", - action="store_true", - dest="unique_tor", - default=False, - help="Make requests over Tor with new Tor circuit after each request; increases runtime; requires Tor to be installed and in system path.", - ) parser.add_argument( "--csv", action="store_true", @@ -719,12 +675,30 @@ def main(): help="Include checking of NSFW sites from default list.", ) + # TODO deprecated in favor of --txt, retained for workflow compatibility, to be removed + # in future release parser.add_argument( "--no-txt", action="store_true", dest="no_txt", default=False, - help="Disable creation of a txt file", + help="Disable creation of a txt file - WILL BE DEPRECATED", + ) + + parser.add_argument( + "--txt", + action="store_true", + dest="output_txt", + default=False, + help="Enable creation of a txt file", + ) + + parser.add_argument( + "--ignore-exclusions", + action="store_true", + dest="ignore_exclusions", + default=False, + help="Ignore upstream exclusions (may return more false positives)", ) args = parser.parse_args() @@ -734,7 +708,7 @@ def main(): # Check for newer version of Sherlock. If it exists, let the user know about it try: - latest_release_raw = requests.get(forge_api_latest_release).text + latest_release_raw = requests.get(forge_api_latest_release, timeout=10).text latest_release_json = json_loads(latest_release_raw) latest_remote_tag = latest_release_json["tag_name"] @@ -747,22 +721,10 @@ def main(): except Exception as error: print(f"A problem occurred while checking for an update: {error}") - # Argument check - # TODO regex check on args.proxy - if args.tor and (args.proxy is not None): - raise Exception("Tor and Proxy cannot be set at the same time.") - # Make prompts if args.proxy is not None: print("Using the proxy: " + args.proxy) - if args.tor or args.unique_tor: - print("Using Tor to make requests") - - print( - "Warning: some websites might refuse connecting over Tor, so note that using this option might increase connection errors." - ) - if args.no_color: # Disable color output. init(strip=True, convert=False) @@ -784,7 +746,8 @@ def main(): try: if args.local: sites = SitesInformation( - os.path.join(os.path.dirname(__file__), "resources/data.json") + os.path.join(os.path.dirname(__file__), "resources/data.json"), + honor_exclusions=False, ) else: json_file_location = args.json_file @@ -793,7 +756,7 @@ def main(): if args.json_file.isnumeric(): pull_number = args.json_file pull_url = f"https://api.github.com/repos/sherlock-project/sherlock/pulls/{pull_number}" - pull_request_raw = requests.get(pull_url).text + pull_request_raw = requests.get(pull_url, timeout=10).text pull_request_json = json_loads(pull_request_raw) # Check if it's a valid pull request @@ -804,7 +767,11 @@ def main(): head_commit_sha = pull_request_json["head"]["sha"] json_file_location = f"https://raw.githubusercontent.com/sherlock-project/sherlock/{head_commit_sha}/sherlock_project/resources/data.json" - sites = SitesInformation(json_file_location) + sites = SitesInformation( + data_file_path=json_file_location, + honor_exclusions=not args.ignore_exclusions, + do_not_exclude=args.site_list, + ) except Exception as error: print(f"ERROR: {error}") sys.exit(1) @@ -858,8 +825,6 @@ def main(): username, site_data, query_notify, - tor=args.tor, - unique_tor=args.unique_tor, dump_response=args.dump_response, proxy=args.proxy, timeout=args.timeout, @@ -875,7 +840,7 @@ def main(): else: result_file = f"{username}.txt" - if not args.no_txt: + if args.output_txt: with open(result_file, "w", encoding="utf-8") as file: exists_counter = 0 for website_name in results: diff --git a/sherlock_project/sites.py b/sherlock_project/sites.py index 847d1576..b7aaf4c5 100644 --- a/sherlock_project/sites.py +++ b/sherlock_project/sites.py @@ -7,6 +7,10 @@ import json import requests import secrets + +MANIFEST_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json" +EXCLUSIONS_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/refs/heads/exclusions/false_positive_exclusions.txt" + class SiteInformation: def __init__(self, name, url_home, url_username_format, username_claimed, information, is_nsfw, username_unclaimed=secrets.token_urlsafe(10)): @@ -67,12 +71,17 @@ class SiteInformation: Return Value: Nicely formatted string to get information about this object. """ - + return f"{self.name} ({self.url_home})" class SitesInformation: - def __init__(self, data_file_path=None): + def __init__( + self, + data_file_path: str|None = None, + honor_exclusions: bool = True, + do_not_exclude: list[str] = [], + ): """Create Sites Information Object. Contains information about all supported websites. @@ -110,7 +119,7 @@ class SitesInformation: # The default data file is the live data.json which is in the GitHub repo. The reason why we are using # this instead of the local one is so that the user has the most up-to-date data. This prevents # users from creating issue about false positives which has already been fixed or having outdated data - data_file_path = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json" + data_file_path = MANIFEST_URL # Ensure that specified data file has correct extension. if not data_file_path.lower().endswith(".json"): @@ -120,7 +129,7 @@ class SitesInformation: if data_file_path.lower().startswith("http"): # Reference is to a URL. try: - response = requests.get(url=data_file_path) + response = requests.get(url=data_file_path, timeout=30) except Exception as error: raise FileNotFoundError( f"Problem while attempting to access data file URL '{data_file_path}': {error}" @@ -152,9 +161,31 @@ class SitesInformation: raise FileNotFoundError(f"Problem while attempting to access " f"data file '{data_file_path}'." ) - + site_data.pop('$schema', None) + if honor_exclusions: + try: + response = requests.get(url=EXCLUSIONS_URL, timeout=10) + if response.status_code == 200: + exclusions = response.text.splitlines() + exclusions = [exclusion.strip() for exclusion in exclusions] + + for site in do_not_exclude: + if site in exclusions: + exclusions.remove(site) + + for exclusion in exclusions: + try: + site_data.pop(exclusion, None) + except KeyError: + pass + + except Exception: + # If there was any problem loading the exclusions, just continue without them + print("Warning: Could not load exclusions, continuing without them.") + honor_exclusions = False + self.sites = {} # Add all site information from the json file to internal site list. @@ -194,7 +225,7 @@ class SitesInformation: for site in self.sites: if self.sites[site].is_nsfw and site.casefold() not in do_not_remove: continue - sites[site] = self.sites[site] + sites[site] = self.sites[site] self.sites = sites def site_name_list(self): diff --git a/tests/conftest.py b/tests/conftest.py index 51c90814..69fce756 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,11 @@ import urllib import pytest from sherlock_project.sites import SitesInformation +def fetch_local_manifest(honor_exclusions: bool = True) -> dict[str, dict[str, str]]: + sites_obj = SitesInformation(data_file_path=os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json"), honor_exclusions=honor_exclusions) + sites_iterable: dict[str, dict[str, str]] = {site.name: site.information for site in sites_obj} + return sites_iterable + @pytest.fixture() def sites_obj(): sites_obj = SitesInformation(data_file_path=os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json")) @@ -11,9 +16,7 @@ def sites_obj(): @pytest.fixture(scope="session") def sites_info(): - sites_obj = SitesInformation(data_file_path=os.path.join(os.path.dirname(__file__), "../sherlock_project/resources/data.json")) - sites_iterable = {site.name: site.information for site in sites_obj} - yield sites_iterable + yield fetch_local_manifest() @pytest.fixture(scope="session") def remote_schema(): @@ -21,3 +24,28 @@ def remote_schema(): with urllib.request.urlopen(schema_url) as remoteschema: schemadat = json.load(remoteschema) yield schemadat + +def pytest_addoption(parser): + parser.addoption( + "--chunked-sites", + action="store", + default=None, + help="For tests utilizing chunked sites, include only the (comma-separated) site(s) specified.", + ) + +def pytest_generate_tests(metafunc): + if "chunked_sites" in metafunc.fixturenames: + sites_info = fetch_local_manifest(honor_exclusions=False) + + # Ingest and apply site selections + site_filter: str | None = metafunc.config.getoption("--chunked-sites") + if site_filter: + selected_sites: list[str] = [site.strip() for site in site_filter.split(",")] + sites_info = { + site: data for site, data in sites_info.items() + if site in selected_sites + } + + params = [{name: data} for name, data in sites_info.items()] + ids = list(sites_info.keys()) + metafunc.parametrize("chunked_sites", params, ids=ids) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 5c47fbb8..b73e9240 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -7,7 +7,7 @@ def test_validate_manifest_against_local_schema(): """Ensures that the manifest matches the local schema, for situations where the schema is being changed.""" json_relative: str = '../sherlock_project/resources/data.json' schema_relative: str = '../sherlock_project/resources/data.schema.json' - + json_path: str = os.path.join(os.path.dirname(__file__), json_relative) schema_path: str = os.path.join(os.path.dirname(__file__), schema_relative) diff --git a/tests/test_validate_targets.py b/tests/test_validate_targets.py new file mode 100644 index 00000000..4eb7ea16 --- /dev/null +++ b/tests/test_validate_targets.py @@ -0,0 +1,99 @@ +import pytest +import re +import rstr + +from sherlock_project.sherlock import sherlock +from sherlock_project.notify import QueryNotify +from sherlock_project.result import QueryResult, QueryStatus + + +FALSE_POSITIVE_ATTEMPTS: int = 2 # Since the usernames are randomly generated, it's POSSIBLE that a real username can be hit +FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND: int = 15 # If a pattern uses quantifiers such as `+` `*` or `{n,}`, limit the upper bound (0 to disable) +FALSE_POSITIVE_DEFAULT_PATTERN: str = r'^[a-zA-Z0-9]{7,20}$' # Used in absence of a regexCheck entry + + +def set_pattern_upper_bound(pattern: str, upper_bound: int = FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND) -> str: + """Set upper bound for regex patterns that use quantifiers such as `+` `*` or `{n,}`.""" + def replace_upper_bound(match: re.Match) -> str: # type: ignore + lower_bound: int = int(match.group(1)) if match.group(1) else 0 # type: ignore + upper_bound = upper_bound if lower_bound < upper_bound else lower_bound # type: ignore # noqa: F823 + return f'{{{lower_bound},{upper_bound}}}' + + pattern = re.sub(r'(? QueryStatus: + """Check if a site is likely to produce false positives.""" + status: QueryStatus = QueryStatus.UNKNOWN + + for _ in range(FALSE_POSITIVE_ATTEMPTS): + query_notify: QueryNotify = QueryNotify() + username: str = rstr.xeger(pattern) + + result: QueryResult | str = sherlock( + username=username, + site_data=sites_info, + query_notify=query_notify, + )[site]['status'] + + if not hasattr(result, 'status'): + raise TypeError(f"Result for site {site} does not have 'status' attribute. Actual result: {result}") + if type(result.status) is not QueryStatus: # type: ignore + raise TypeError(f"Result status for site {site} is not of type QueryStatus. Actual type: {type(result.status)}") # type: ignore + status = result.status # type: ignore + + if status in (QueryStatus.AVAILABLE, QueryStatus.WAF): + return status + + return status + + +def false_negative_check(sites_info: dict[str, dict[str, str]], site: str) -> QueryStatus: + """Check if a site is likely to produce false negatives.""" + status: QueryStatus = QueryStatus.UNKNOWN + query_notify: QueryNotify = QueryNotify() + + result: QueryResult | str = sherlock( + username=sites_info[site]['username_claimed'], + site_data=sites_info, + query_notify=query_notify, + )[site]['status'] + + if not hasattr(result, 'status'): + raise TypeError(f"Result for site {site} does not have 'status' attribute. Actual result: {result}") + if type(result.status) is not QueryStatus: # type: ignore + raise TypeError(f"Result status for site {site} is not of type QueryStatus. Actual type: {type(result.status)}") # type: ignore + status = result.status # type: ignore + + return status + +@pytest.mark.validate_targets +@pytest.mark.online +class Test_All_Targets: + + @pytest.mark.validate_targets_fp + def test_false_pos(self, chunked_sites: dict[str, dict[str, str]]): + """Iterate through all sites in the manifest to discover possible false-positive inducting targets.""" + pattern: str + for site in chunked_sites: + try: + pattern = chunked_sites[site]['regexCheck'] + except KeyError: + pattern = FALSE_POSITIVE_DEFAULT_PATTERN + + if FALSE_POSITIVE_QUANTIFIER_UPPER_BOUND > 0: + pattern = set_pattern_upper_bound(pattern) + + result: QueryStatus = false_positive_check(chunked_sites, site, pattern) + assert result is QueryStatus.AVAILABLE, f"{site} produced false positive with pattern {pattern}, result was {result}" + + @pytest.mark.validate_targets_fn + def test_false_neg(self, chunked_sites: dict[str, dict[str, str]]): + """Iterate through all sites in the manifest to discover possible false-negative inducting targets.""" + for site in chunked_sites: + result: QueryStatus = false_negative_check(chunked_sites, site) + assert result is QueryStatus.CLAIMED, f"{site} produced false negative, result was {result}" + diff --git a/tox.ini b/tox.ini index 1e9a47de..8c43ac30 100644 --- a/tox.ini +++ b/tox.ini @@ -7,8 +7,6 @@ envlist = py312 py311 py310 - py39 - py38 [testenv] description = Attempt to build and install the package @@ -16,6 +14,7 @@ deps = coverage jsonschema pytest + rstr allowlist_externals = coverage commands = coverage run --source=sherlock_project --module pytest -v @@ -37,7 +36,7 @@ commands = [gh-actions] python = + 3.13: py313 3.12: py312 3.11: py311 3.10: py310 - 3.9: py39