Merge pull request #2919 from quan-nguyen-2110/fix-cracked-forum-false-positive

Fix Cracked Forum false positives
Merge pull request #2921 from quan-nguyen-2110/fix-akniga-false-negative
2026-05-04 23:28:52 -04:00 · 2026-05-04 23:28:14 -04:00 · 2026-05-04 23:23:43 -04:00 · 2026-05-04 23:12:52 -04:00 · 2026-05-04 23:07:07 -04:00 · 2026-05-02 09:46:59 +02:00
15 changed files with 909 additions and 379 deletions
@@ -65,7 +65,7 @@ The Actor provides three types of outputs:
 | Field | Type | Required | Description |
 |-------|------|----------|-------------|
 | `username` | string | Yes | Username the search was conducted for |
-| `links` | arrray | Yes | Array with found links to the social media |
+| `links` | array | Yes | Array with found links to the social media |
 | `links[]` | string | No | URL to the account |
 ### Example Dataset Item (JSON)
@@ -11,6 +11,7 @@ on:
      - '**/*.py'
      - '**/*.ini'
      - '**/*.toml'
      - 'Dockerfile'
  push:
    branches:
      - master
@@ -21,15 +22,17 @@ on:
      - '**/*.py'
      - '**/*.ini'
      - '**/*.toml'
      - 'Dockerfile'
 jobs:
  tox-lint:
    # Linting is ran through tox to ensure that the same linter is used by local runners
    runs-on: ubuntu-latest
    # Linting is run through tox to ensure that the same linter
    # is used by local runners
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
      - name: Set up linting environment
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: '3.x'
      - name: Install tox and related dependencies
@@ -41,7 +44,8 @@ jobs:
  tox-matrix:
    runs-on: ${{ matrix.os }}
    strategy:
-      fail-fast: false # We want to know what specicic versions it fails on
+      # We want to know what specific versions it fails on
      fail-fast: false
      matrix:
        os: [
          ubuntu-latest,
@@ -53,11 +57,13 @@ jobs:
          '3.11',
          '3.12',
          '3.13',
          '3.14',
          '3.14t',
        ]
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
      - name: Set up environment ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install tox and related dependencies
@@ -67,3 +73,22 @@ jobs:
          pip install tox-gh-actions
      - name: Run tox
        run: tox
  docker-build-test:
    # Smoke test: build the Docker image from the repository root and confirm
    # the resulting container can be started with `--version`.
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v6
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Get version from pyproject.toml
        id: get-version
        # Takes the first line containing `version = ` and extracts the text
        # between its first pair of double quotes.
        run: |
          VERSION=$(grep -m1 'version = ' pyproject.toml | cut -d'"' -f2)
          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
      - name: Build Docker image
        env:
          # Passed through the environment instead of being interpolated into
          # the script text, so the value is never re-parsed as shell syntax.
          VERSION_TAG: ${{ steps.get-version.outputs.version }}
        run: |
          docker build \
            --build-arg VERSION_TAG="$VERSION_TAG" \
            -t sherlock-test:latest .
      - name: Test Docker image runs
        run: docker run --rm sherlock-test:latest --version
@@ -17,29 +17,41 @@ jobs:
      - name: Checkout repository
        uses: actions/checkout@v5
        with:
          # Checkout the base branch but fetch all history to avoid a second fetch call
          ref: ${{ github.base_ref }}
-          fetch-depth: 1
+          fetch-depth: 0
          persist-credentials: false
      - name: Set up Python
        uses: actions/setup-python@v6
        with:
-          python-version: '3.13'
+          python-version: "3.13"
      - name: Install Poetry
        uses: abatilo/actions-poetry@v4
        with:
-          poetry-version: 'latest'
+          poetry-version: "latest"
      - name: Install dependencies
        run: |
          poetry install --no-interaction --with dev
-      - name: Drop in place updated manifest from base
+      - name: Prepare JSON versions for comparison
        run: |
-          cp sherlock_project/resources/data.json data.json.base
+          # Fetch only the PR's branch head (single network call in this step)
-          git fetch origin pull/${{ github.event.pull_request.number }}/head:pr --depth=1
+          git fetch origin pull/${{ github.event.pull_request.number }}/head:pr
-          git show pr:sherlock_project/resources/data.json > sherlock_project/resources/data.json
+
-          cp sherlock_project/resources/data.json data.json.head
+          # Find the merge-base commit between the target branch and the PR branch
          MERGE_BASE=$(git merge-base origin/${{ github.base_ref }} pr)
          echo "Comparing PR head against merge-base commit: $MERGE_BASE"
          # Safely extract the file from the PR's head and the merge-base commit
          git show pr:sherlock_project/resources/data.json > data.json.head
          git show $MERGE_BASE:sherlock_project/resources/data.json > data.json.base
          # CRITICAL FIX: Overwrite the checked-out data.json with the one from the PR
          # This ensures that pytest runs against the new, updated file.
          cp data.json.head sherlock_project/resources/data.json
      - name: Discover modified targets
        id: discover-modified
@@ -47,8 +59,16 @@ jobs:
          CHANGED=$(
            python - <<'EOF'
          import json
-          with open("data.json.base") as f: base = json.load(f)
+          import sys
-          with open("data.json.head") as f: head = json.load(f)
+          try:
              with open("data.json.base") as f: base = json.load(f)
              with open("data.json.head") as f: head = json.load(f)
          except FileNotFoundError as e:
              print(f"Error: Could not find {e.filename}", file=sys.stderr)
              sys.exit(1)
          except json.JSONDecodeError as e:
              print(f"Error: Could not decode JSON from a file - {e}", file=sys.stderr)
              sys.exit(1)
          changed = []
          for k, v in head.items():
@@ -63,12 +83,19 @@ jobs:
          echo -e ">>> Changed targets: \n$(echo $CHANGED | tr ',' '\n')"
          echo "changed_targets=$CHANGED" >> "$GITHUB_OUTPUT"
-      - name: Validate modified targets
+      - name: Validate remote manifest against local schema
        if: steps.discover-modified.outputs.changed_targets != ''
-        continue-on-error: true
+        run: |
          poetry run pytest tests/test_manifest.py::test_validate_manifest_against_local_schema
      # --- The rest of the steps below are unchanged ---
      - name: Validate modified targets
        env:
          CHANGED_TARGETS: ${{ steps.discover-modified.outputs.changed_targets }}
        run: |
          poetry run pytest -q --tb no -rA -m validate_targets -n 20 \
-            --chunked-sites "${{ steps.discover-modified.outputs.changed_targets }}" \
+            --chunked-sites "$CHANGED_TARGETS" \
            --junitxml=validation_results.xml
      - name: Prepare validation summary
@@ -4,7 +4,7 @@
  # 3. Build image with BOTH latest and version tags
    # i.e. `docker build -t sherlock/sherlock:0.16.0 -t sherlock/sherlock:latest .`
-FROM python:3.12-slim-bullseye as build
+FROM python:3.12-slim-bullseye AS build
 WORKDIR /sherlock
 RUN pip3 install --no-cache-dir --upgrade pip
@@ -1,39 +1,45 @@
 #!/usr/bin/env python
 # This module generates the listing of supported sites which can be found in
-# sites.md. It also organizes all the sites in alphanumeric order
+# sites.mdx. It also organizes all the sites in alphanumeric order
 import json
 import os
 DATA_REL_URI: str = "sherlock_project/resources/data.json"
 DEFAULT_ENCODING = "utf-8"
 # Read the data.json file
-with open(DATA_REL_URI, "r", encoding="utf-8") as data_file:
+with open(DATA_REL_URI, "r", encoding=DEFAULT_ENCODING) as data_file:
    data: dict = json.load(data_file)
 # Removes schema-specific keywords for proper processing
-social_networks: dict = dict(data)
+social_networks = data.copy()
 social_networks.pop('$schema', None)
 # Sort the social networks in alphanumeric order
-social_networks: list = sorted(social_networks.items())
+social_networks = sorted(social_networks.items())
 # Make output dir where the site list will be written
 os.mkdir("output")
-# Write the list of supported sites to sites.md
+# Write the list of supported sites to sites.mdx
-with open("output/sites.mdx", "w") as site_file:
+with open("output/sites.mdx", "w", encoding=DEFAULT_ENCODING) as site_file:
-    site_file.write("---\ntitle: 'List of supported sites'\nsidebarTitle: 'Supported sites'\nicon: 'globe'\ndescription: 'Sherlock currently supports **400+** sites'\n---\n\n")
+    site_file.write("---\n")
    site_file.write("title: 'List of supported sites'\n")
    site_file.write("sidebarTitle: 'Supported sites'\n")
    site_file.write("icon: 'globe'\n")
    site_file.write("description: 'Sherlock currently supports **400+** sites'\n")
    site_file.write("---\n\n")
    for social_network, info in social_networks:
        url_main = info["urlMain"]
        is_nsfw = "**(NSFW)**" if info.get("isNSFW") else ""
        site_file.write(f"1. [{social_network}]({url_main}) {is_nsfw}\n")
 # Overwrite the data.json file with sorted data
-with open(DATA_REL_URI, "w") as data_file:
+with open(DATA_REL_URI, "w", encoding=DEFAULT_ENCODING) as data_file:
    sorted_data = json.dumps(data, indent=2, sort_keys=True)
    data_file.write(sorted_data)
-    data_file.write("\n")
+    data_file.write("\n")  # Keep the newline after writing data
 print("Finished updating supported site listing!")
@@ -23,17 +23,17 @@
 > [!WARNING]  
 > Packages for ParrotOS and Ubuntu 24.04, maintained by a third party, appear to be __broken__.  
-> Users of these systems should defer to pipx/pip or Docker.
+> Users of these systems should defer to [`uv`](https://docs.astral.sh/uv/)/`pipx`/`pip` or Docker.
 | Method | Notes |
 | - | - |
-| `pipx install sherlock-project` | `pip` may be used in place of `pipx` |
+| `pipx install sherlock-project` | `pip` or [`uv`](https://docs.astral.sh/uv/) may be used in place of `pipx` |
 | `docker run -it --rm sherlock/sherlock` | |
 | `dnf install sherlock-project` | |
 Community-maintained packages are available for Debian (>= 13), Ubuntu (>= 22.10), Homebrew, Kali, and BlackArch. These packages are not directly supported or maintained by the Sherlock Project.
-See all alternative installation methods [here](https://sherlockproject.xyz/installation)
+See all alternative installation methods [here](https://sherlockproject.xyz/installation).
 ## General usage
@@ -51,70 +51,42 @@ Accounts found will be stored in an individual text file with the corresponding
 ```console
 $ sherlock --help
-usage: sherlock [-h] [--version] [--verbose] [--folderoutput FOLDEROUTPUT]
+usage: sherlock [-h] [--version] [--verbose] [--folderoutput FOLDEROUTPUT] [--output OUTPUT] [--csv] [--xlsx] [--site SITE_NAME] [--proxy PROXY_URL] [--dump-response]
-                [--output OUTPUT] [--tor] [--unique-tor] [--csv] [--xlsx]
+                [--json JSON_FILE] [--timeout TIMEOUT] [--print-all] [--print-found] [--no-color] [--browse] [--local] [--nsfw] [--txt] [--ignore-exclusions]
                [--site SITE_NAME] [--proxy PROXY_URL] [--json JSON_FILE]
                [--timeout TIMEOUT] [--print-all] [--print-found] [--no-color]
                [--browse] [--local] [--nsfw]
                USERNAMES [USERNAMES ...]
-Sherlock: Find Usernames Across Social Networks (Version 0.14.3)
+Sherlock: Find Usernames Across Social Networks (Version 0.16.0)
 positional arguments:
-  USERNAMES             One or more usernames to check with social networks.
+  USERNAMES             One or more usernames to check with social networks. Check similar usernames using {?} (replace to '_', '-', '.').
                        Check similar usernames using {?} (replace to '_', '-', '.').
-optional arguments:
+options:
  -h, --help            show this help message and exit
  --version             Display version information and dependencies.
  --verbose, -v, -d, --debug
                        Display extra debugging information and metrics.
  --folderoutput FOLDEROUTPUT, -fo FOLDEROUTPUT
-                        If using multiple usernames, the output of the results will be
+                        If using multiple usernames, the output of the results will be saved to this folder.
                        saved to this folder.
  --output OUTPUT, -o OUTPUT
-                        If using single username, the output of the result will be saved
+                        If using single username, the output of the result will be saved to this file.
                        to this file.
  --tor, -t             Make requests over Tor; increases runtime; requires Tor to be
                        installed and in system path.
  --unique-tor, -u      Make requests over Tor with new Tor circuit after each request;
                        increases runtime; requires Tor to be installed and in system
                        path.
  --csv                 Create Comma-Separated Values (CSV) File.
-  --xlsx                Create the standard file for the modern Microsoft Excel
+  --xlsx                Create the standard file for the modern Microsoft Excel spreadsheet (xlsx).
-                        spreadsheet (xlsx).
+  --site SITE_NAME      Limit analysis to just the listed sites. Add multiple options to specify more than one site.
  --site SITE_NAME      Limit analysis to just the listed sites. Add multiple options to
                        specify more than one site.
  --proxy PROXY_URL, -p PROXY_URL
                        Make requests over a proxy. e.g. socks5://127.0.0.1:1080
  --dump-response       Dump the HTTP response to stdout for targeted debugging.
  --json JSON_FILE, -j JSON_FILE
-                        Load data from a JSON file or an online, valid, JSON file.
+                        Load data from a JSON file or an online, valid, JSON file. Upstream PR numbers also accepted.
  --timeout TIMEOUT     Time (in seconds) to wait for response to requests (Default: 60)
  --print-all           Output sites where the username was not found.
-  --print-found         Output sites where the username was found.
+  --print-found         Output sites where the username was found (also if exported as file).
  --no-color            Don't color terminal output
  --browse, -b          Browse to all results on default browser.
  --local, -l           Force the use of the local data.json file.
  --nsfw                Include checking of NSFW sites from default list.
  --txt                 Enable creation of a txt file
  --ignore-exclusions   Ignore upstream exclusions (may return more false positives)
 ```
 ## Apify Actor Usage [![Sherlock Actor](https://apify.com/actor-badge?actor=netmilk/sherlock)](https://apify.com/netmilk/sherlock?fpr=sherlock)
 <a href="https://apify.com/netmilk/sherlock?fpr=sherlock"><img src="https://apify.com/ext/run-on-apify.png" alt="Run Sherlock Actor on Apify" width="176" height="39" /></a>
 You can run Sherlock in the cloud without installation using the [Sherlock Actor](https://apify.com/netmilk/sherlock?fpr=sherlock) on [Apify](https://apify.com?fpr=sherlock) free of charge.
 ``` bash
 $ echo '{"usernames":["user123"]}' | apify call -so netmilk/sherlock
 [{
  "username": "user123",
  "links": [
    "https://www.1337x.to/user/user123/",
    ...
  ]
 }]
 ```
 Read more about the [Sherlock Actor](../.actor/README.md), including how to use it programmatically via the Apify [API](https://apify.com/netmilk/sherlock/api?fpr=sherlock), [CLI](https://docs.apify.com/cli/?fpr=sherlock) and [JS/TS and Python SDKs](https://docs.apify.com/sdk?fpr=sherlock).
 ## Credits
@@ -124,7 +96,7 @@ Thank you to everyone who has contributed to Sherlock! ❤️
  <img src="https://contrib.rocks/image?&columns=25&max=10000&&repo=sherlock-project/sherlock" alt="contributors"/>
 </a>
-## Star history
+## Star History
 <picture>
  <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=sherlock-project/sherlock&type=Date&theme=dark" />
@@ -135,7 +107,7 @@ Thank you to everyone who has contributed to Sherlock! ❤️
 ## License
 MIT © Sherlock Project<br/>
-Original Creator - [Siddharth Dushantha](https://github.com/sdushantha)
+Creator - [Siddharth Dushantha](https://github.com/sdushantha)
 <!-- Reference Links -->
@@ -8,7 +8,7 @@ source = "init"
 [tool.poetry]
 name = "sherlock-project"
-version = "0.16.0"
+version = "0.16.1"
 description = "Hunt down social media accounts by username across social networks"
 license = "MIT"
 authors = [
@@ -29,6 +29,10 @@ classifiers = [
    "Natural Language :: English",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Security"
 ]
 homepage = "https://sherlockproject.xyz/"
@@ -46,14 +50,10 @@ PySocks = "^1.7.0"
 requests = "^2.22.0"
 requests-futures = "^1.0.0"
 stem = "^1.8.0"
 torrequest = "^0.1.0"
 pandas = "^2.2.1"
 openpyxl = "^3.0.10"
 tomli = "^2.2.1"
 [tool.poetry.extras]
 tor = ["torrequest"]
 [tool.poetry.group.dev.dependencies]
 jsonschema = "^4.0.0"
 rstr = "^3.2.2"
@@ -37,7 +37,6 @@ class QueryNotify:
        self.result = result
        # return
    def start(self, message=None):
        """Notify Start.
@@ -56,7 +55,6 @@ class QueryNotify:
        Nothing.
        """
        # return
    def update(self, result):
        """Notify Update.
@@ -75,7 +73,6 @@ class QueryNotify:
        self.result = result
        # return
    def finish(self, message=None):
        """Notify Finish.
@@ -94,7 +91,6 @@ class QueryNotify:
        Nothing.
        """
        # return
    def __str__(self):
        """Convert Object To String.
@@ -137,7 +133,6 @@ class QueryNotifyPrint(QueryNotify):
        self.print_all = print_all
        self.browse = browse
        return
    def start(self, message):
        """Notify Start.
@@ -163,7 +158,6 @@ class QueryNotifyPrint(QueryNotify):
        # An empty line between first line and the result (clearer output)
        print('\r')
        return
    def countResults(self):
        """This function counts the number of results. Every time the function is called,
@@ -238,7 +232,7 @@ class QueryNotifyPrint(QueryNotify):
                      Fore.WHITE + "]" +
                      Fore.GREEN + f" {self.result.site_name}:" +
                      Fore.YELLOW + f" {msg}")
-                
+
        elif result.status == QueryStatus.WAF:
            if self.print_all:
                print(Style.BRIGHT + Fore.WHITE + "[" +
@@ -254,10 +248,9 @@ class QueryNotifyPrint(QueryNotify):
                f"Unknown Query Status '{result.status}' for site '{self.result.site_name}'"
            )
        return
    def finish(self, message="The processing has been finished."):
-        """Notify Start.
+        """Notify Finish.
        Will print the last line to the standard output.
        Keyword Arguments:
        self                   -- This object.
@@ -1,80 +1,149 @@
 {
-    "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
-    "title": "Sherlock Target Manifest",
+  "title": "Sherlock Target Manifest",
-    "description": "Social media targets to probe for the existence of known usernames",
+  "description": "Social media targets to probe for the existence of known usernames",
-    "type": "object",
+  "type": "object",
-    "properties": {
+  "properties": {
-        "$schema": { "type": "string" }
+    "$schema": { "type": "string" }
-    },
+  },
-    "patternProperties": {
+  "patternProperties": {
-        "^(?!\\$).*?$": {
+    "^(?!\\$).*?$": {
-            "type": "object",
+      "type": "object",
-            "description": "Target name and associated information (key should be human readable name)",
+      "description": "Target name and associated information (key should be human readable name)",
-            "required": [ "url", "urlMain", "errorType", "username_claimed" ],
+      "required": ["url", "urlMain", "errorType", "username_claimed"],
-            "properties": {
+      "properties": {
-                "url": { "type": "string" },
+        "url": { "type": "string" },
-                "urlMain": { "type": "string" },
+        "urlMain": { "type": "string" },
-                "urlProbe": { "type": "string" },
+        "urlProbe": { "type": "string" },
-                "username_claimed": { "type": "string" },
+        "username_claimed": { "type": "string" },
-                "regexCheck": { "type": "string" },
+        "regexCheck": { "type": "string" },
-                "isNSFW": { "type": "boolean" },
+        "isNSFW": { "type": "boolean" },
-                "headers": { "type": "object" },
+        "headers": { "type": "object" },
-                "request_payload": { "type": "object" },
+        "request_payload": { "type": "object" },
-                "__comment__": {
+        "__comment__": {
-                    "type": "string",
+          "type": "string",
-                    "description": "Used to clarify important target information if (and only if) a commit message would not suffice.\nThis key should not be parsed anywhere within Sherlock."
+          "description": "Used to clarify important target information if (and only if) a commit message would not suffice.\nThis key should not be parsed anywhere within Sherlock."
-                },
+        },
-                "tags": {
+        "tags": {
-                    "oneOf": [
+          "oneOf": [
-                        { "$ref": "#/$defs/tag" },
+            { "$ref": "#/$defs/tag" },
-                        { "type": "array", "items": { "$ref": "#/$defs/tag" } }
+            { "type": "array", "items": { "$ref": "#/$defs/tag" } }
-                    ]
+          ]
-                },
+        },
-                "request_method": {
+        "request_method": {
-                    "type": "string",
+          "type": "string",
-                    "enum": [ "GET", "POST", "HEAD", "PUT" ]
+          "enum": ["GET", "POST", "HEAD", "PUT"]
-                },
+        },
        "errorType": {
          "oneOf": [
            {
              "type": "string",
              "enum": ["message", "response_url", "status_code"]
            },
            {
              "type": "array",
              "items": {
                "type": "string",
                "enum": ["message", "response_url", "status_code"]
              }
            }
          ]
        },
        "errorMsg": {
          "oneOf": [
            { "type": "string" },
            { "type": "array", "items": { "type": "string" } }
          ]
        },
        "errorCode": {
          "oneOf": [
            { "type": "integer" },
            { "type": "array", "items": { "type": "integer" } }
          ]
        },
        "errorUrl": { "type": "string" },
        "response_url": { "type": "string" }
      },
      "dependencies": {
        "errorMsg": {
          "oneOf": [
            { "properties": { "errorType": { "const": "message" } } },
            {
              "properties": {
                "errorType": {
-                    "type": "string",
+                  "type": "array",
-                    "enum": [ "message", "response_url", "status_code" ]
+                  "contains": { "const": "message" }
                },
                "errorMsg": {
                    "oneOf": [
                        { "type": "string" },
                        { "type": "array", "items": { "type": "string" } }
                    ]
                },
                "errorCode": {
                    "oneOf": [
                        { "type": "integer" },
                        { "type": "array", "items": { "type": "integer" } }
                    ]
                },
                "errorUrl": { "type": "string" },
                "response_url": { "type": "string" }
            },
            "dependencies": {
                "errorMsg": {
                    "properties" : { "errorType": { "const": "message" } }
                },
                "errorUrl": {
                    "properties": { "errorType": { "const": "response_url" } }
                },
                "errorCode": {
                    "properties": { "errorType": { "const": "status_code" } }
                }
-            },
+              }
-            "if": { "properties": { "errorType": { "const": "message" } } },
+            }
-            "then": { "required": [ "errorMsg" ] },
+          ]
-            "else": {
+        },
-                "if": { "properties": { "errorType": { "const": "response_url" } } },
+        "errorUrl": {
-                "then": { "required": [ "errorUrl" ] }
+          "oneOf": [
-            },
+            { "properties": { "errorType": { "const": "response_url" } } },
-            "additionalProperties": false
+            {
              "properties": {
                "errorType": {
                  "type": "array",
                  "contains": { "const": "response_url" }
                }
              }
            }
          ]
        },
        "errorCode": {
          "oneOf": [
            { "properties": { "errorType": { "const": "status_code" } } },
            {
              "properties": {
                "errorType": {
                  "type": "array",
                  "contains": { "const": "status_code" }
                }
              }
            }
          ]
        }
-    },
+      },
-    "additionalProperties": false,
+      "allOf": [
-    "$defs": {
+        {
-        "tag": { "type": "string", "enum": [ "adult", "gaming" ] }
+          "if": {
            "anyOf": [
              { "properties": { "errorType": { "const": "message" } } },
              {
                "properties": {
                  "errorType": {
                    "type": "array",
                    "contains": { "const": "message" }
                  }
                }
              }
            ]
          },
          "then": { "required": ["errorMsg"] }
        },
        {
          "if": {
            "anyOf": [
              { "properties": { "errorType": { "const": "response_url" } } },
              {
                "properties": {
                  "errorType": {
                    "type": "array",
                    "contains": { "const": "response_url" }
                  }
                }
              }
            ]
          },
          "then": { "required": ["errorUrl"] }
        }
      ],
      "additionalProperties": false
    }
  },
  "additionalProperties": false,
  "$defs": {
    "tag": { "type": "string", "enum": ["adult", "gaming"] }
  }
 }
@@ -136,6 +136,9 @@ def get_response(request_future, error_type, social_network):
    except requests.exceptions.RequestException as err:
        error_context = "Unknown Error"
        exception_text = str(err)
    except UnicodeError as err:
        error_context = "Encoding Error"
        exception_text = str(err)
    return response, error_context, exception_text
@@ -171,8 +174,6 @@ def sherlock(
    username: str,
    site_data: dict[str, dict[str, str]],
    query_notify: QueryNotify,
    tor: bool = False,
    unique_tor: bool = False,
    dump_response: bool = False,
    proxy: Optional[str] = None,
    timeout: int = 60,
@@ -188,8 +189,6 @@ def sherlock(
    query_notify           -- Object with base type of QueryNotify().
                              This will be used to notify the caller about
                              query results.
    tor                    -- Boolean indicating whether to use a tor circuit for the requests.
    unique_tor             -- Boolean indicating whether to use a new tor circuit for each request.
    proxy                  -- String indicating the proxy URL
    timeout                -- Time in seconds to wait before timing out request.
                              Default is 60 seconds.
@@ -210,32 +209,9 @@ def sherlock(
    # Notify caller that we are starting the query.
    query_notify.start(username)
    # Create session based on request methodology
    if tor or unique_tor:
        try:
            from torrequest import TorRequest  # noqa: E402
        except ImportError:
            print("Important!")
            print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.")
            print("> If you've installed Sherlock via pip, you can include the optional dependency via `pip install 'sherlock-project[tor]'`.")
            print("> Other packages should refer to their documentation, or install it separately with `pip install torrequest`.\n")
            sys.exit(query_notify.finish())
-        print("Important!")
+    # Normal requests
-        print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.")
+    underlying_session = requests.session()
        # Requests using Tor obfuscation
        try:
            underlying_request = TorRequest()
        except OSError:
            print("Tor not found in system path. Unable to continue.\n")
            sys.exit(query_notify.finish())
        underlying_session = underlying_request.session
    else:
        # Normal requests
        underlying_session = requests.session()
        underlying_request = requests.Request()
    # Limit number of workers to 20.
    # This is probably vastly overkill.
@@ -359,15 +335,10 @@ def sherlock(
            # Store future in data for access later
            net_info["request_future"] = future
            # Reset identify for tor (if needed)
            if unique_tor:
                underlying_request.reset_identity()
        # Add this site's results into final dictionary with all the other results.
        results_total[social_network] = results_site
    # Open the file containing account links
    # Core logic: If tor requests, make them here. If multi-threaded requests, wait for responses
    for social_network, net_info in site_data.items():
        # Retrieve results again
        results_site = results_total.get(social_network)
@@ -381,6 +352,8 @@ def sherlock(
        # Get the expected error type
        error_type = net_info["errorType"]
        if isinstance(error_type, str):
            error_type: list[str] = [error_type]
        # Retrieve future and ensure it has finished
        future = net_info["request_future"]
@@ -425,58 +398,60 @@ def sherlock(
        elif any(hitMsg in r.text for hitMsg in WAFHitMsgs):
            query_status = QueryStatus.WAF
        elif error_type == "message":
            # error_flag True denotes no error found in the HTML
            # error_flag False denotes error found in the HTML
            error_flag = True
            errors = net_info.get("errorMsg")
            # errors will hold the error message
            # it can be string or list
            # by isinstance method we can detect that
            # and handle the case for strings as normal procedure
            # and if it's a list we can iterate over the errors
            if isinstance(errors, str):
                # Checks if the error message is in the HTML
                # if error is present we will set flag to False
                if errors in r.text:
                    error_flag = False
            else:
                # If it's a list, iterate over all the error messages
                for error in errors:
                    if error in r.text:
                        error_flag = False
                        break
            if error_flag:
                query_status = QueryStatus.CLAIMED
            else:
                query_status = QueryStatus.AVAILABLE
        elif error_type == "status_code":
            error_codes = net_info.get("errorCode")
            query_status = QueryStatus.CLAIMED
            # Type consistency, allowing for both singlets and lists in manifest
            if isinstance(error_codes, int):
                error_codes = [error_codes]
            if error_codes is not None and r.status_code in error_codes:
                query_status = QueryStatus.AVAILABLE
            elif r.status_code >= 300 or r.status_code < 200:
                query_status = QueryStatus.AVAILABLE
        elif error_type == "response_url":
            # For this detection method, we have turned off the redirect.
            # So, there is no need to check the response URL: it will always
            # match the request.  Instead, we will ensure that the response
            # code indicates that the request was successful (i.e. no 404, or
            # forward to some odd redirect).
            if 200 <= r.status_code < 300:
                query_status = QueryStatus.CLAIMED
            else:
                query_status = QueryStatus.AVAILABLE
        else:
-            # It should be impossible to ever get here...
+            if any(errtype not in ["message", "status_code", "response_url"] for errtype in error_type):
-            raise ValueError(
+                error_context = f"Unknown error type '{error_type}' for {social_network}"
-                f"Unknown Error Type '{error_type}' for " f"site '{social_network}'"
+                query_status = QueryStatus.UNKNOWN
-            )
+            else:
                if "message" in error_type:
                    # error_flag True denotes no error found in the HTML
                    # error_flag False denotes error found in the HTML
                    error_flag = True
                    errors = net_info.get("errorMsg")
                    # errors will hold the error message
                    # it can be string or list
                    # by isinstance method we can detect that
                    # and handle the case for strings as normal procedure
                    # and if it's a list we can iterate over the errors
                    if isinstance(errors, str):
                        # Checks if the error message is in the HTML
                        # if error is present we will set flag to False
                        if errors in r.text:
                            error_flag = False
                    else:
                        # If it's a list, iterate over all the error messages
                        for error in errors:
                            if error in r.text:
                                error_flag = False
                                break
                    if error_flag:
                        query_status = QueryStatus.CLAIMED
                    else:
                        query_status = QueryStatus.AVAILABLE
                if "status_code" in error_type and query_status is not QueryStatus.AVAILABLE:
                    error_codes = net_info.get("errorCode")
                    query_status = QueryStatus.CLAIMED
                    # Type consistency, allowing for both singlets and lists in manifest
                    if isinstance(error_codes, int):
                        error_codes = [error_codes]
                    if error_codes is not None and r.status_code in error_codes:
                        query_status = QueryStatus.AVAILABLE
                    elif r.status_code >= 300 or r.status_code < 200:
                        query_status = QueryStatus.AVAILABLE
                if "response_url" in error_type and query_status is not QueryStatus.AVAILABLE:
                    # For this detection method, we have turned off the redirect.
                    # So, there is no need to check the response URL: it will always
                    # match the request.  Instead, we will ensure that the response
                    # code indicates that the request was successful (i.e. no 404, or
                    # forward to some odd redirect).
                    if 200 <= r.status_code < 300:
                        query_status = QueryStatus.CLAIMED
                    else:
                        query_status = QueryStatus.AVAILABLE
        if dump_response:
            print("+++++++++++++++++++++")
@@ -596,22 +571,6 @@ def main():
        dest="output",
        help="If using single username, the output of the result will be saved to this file.",
    )
    parser.add_argument(
        "--tor",
        "-t",
        action="store_true",
        dest="tor",
        default=False,
        help="Make requests over Tor; increases runtime; requires Tor to be installed and in system path.",
    )
    parser.add_argument(
        "--unique-tor",
        "-u",
        action="store_true",
        dest="unique_tor",
        default=False,
        help="Make requests over Tor with new Tor circuit after each request; increases runtime; requires Tor to be installed and in system path.",
    )
    parser.add_argument(
        "--csv",
        action="store_true",
@@ -720,11 +679,11 @@ def main():
    )
    parser.add_argument(
-        "--no-txt",
+        "--txt",
        action="store_true",
-        dest="no_txt",
+        dest="output_txt",
        default=False,
-        help="Disable creation of a txt file",
+        help="Enable creation of a txt file",
    )
    parser.add_argument(
@@ -742,7 +701,7 @@ def main():
    # Check for newer version of Sherlock. If it exists, let the user know about it
    try:
-        latest_release_raw = requests.get(forge_api_latest_release).text
+        latest_release_raw = requests.get(forge_api_latest_release, timeout=10).text
        latest_release_json = json_loads(latest_release_raw)
        latest_remote_tag = latest_release_json["tag_name"]
@@ -755,22 +714,10 @@ def main():
    except Exception as error:
        print(f"A problem occurred while checking for an update: {error}")
    # Argument check
    # TODO regex check on args.proxy
    if args.tor and (args.proxy is not None):
        raise Exception("Tor and Proxy cannot be set at the same time.")
    # Make prompts
    if args.proxy is not None:
        print("Using the proxy: " + args.proxy)
    if args.tor or args.unique_tor:
        print("Using Tor to make requests")
        print(
            "Warning: some websites might refuse connecting over Tor, so note that using this option might increase connection errors."
        )
    if args.no_color:
        # Disable color output.
        init(strip=True, convert=False)
@@ -802,7 +749,7 @@ def main():
                if args.json_file.isnumeric():
                    pull_number = args.json_file
                    pull_url = f"https://api.github.com/repos/sherlock-project/sherlock/pulls/{pull_number}"
-                    pull_request_raw = requests.get(pull_url).text
+                    pull_request_raw = requests.get(pull_url, timeout=10).text
                    pull_request_json = json_loads(pull_request_raw)
                    # Check if it's a valid pull request
@@ -871,8 +818,6 @@ def main():
            username,
            site_data,
            query_notify,
            tor=args.tor,
            unique_tor=args.unique_tor,
            dump_response=args.dump_response,
            proxy=args.proxy,
            timeout=args.timeout,
@@ -888,7 +833,7 @@ def main():
        else:
            result_file = f"{username}.txt"
-        if not args.no_txt:
+        if args.output_txt:
            with open(result_file, "w", encoding="utf-8") as file:
                exists_counter = 0
                for website_name in results:
@@ -973,8 +918,8 @@ def main():
                {
                    "username": usernames,
                    "name": names,
-                    "url_main": url_main,
+                    "url_main": [f'=HYPERLINK(\"{u}\")' for u in url_main],
-                    "url_user": url_user,
+                    "url_user": [f'=HYPERLINK(\"{u}\")' for u in url_user],
                    "exists": exists,
                    "http_status": http_status,
                    "response_time_s": response_time_s,
@@ -8,7 +8,7 @@ import requests
 import secrets
-MANIFEST_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
+MANIFEST_URL = "https://data.sherlockproject.xyz"
 EXCLUSIONS_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/refs/heads/exclusions/false_positive_exclusions.txt"
 class SiteInformation:
@@ -121,15 +121,10 @@ class SitesInformation:
            # users from creating issue about false positives which has already been fixed or having outdated data
            data_file_path = MANIFEST_URL
        # Ensure that specified data file has correct extension.
        if not data_file_path.lower().endswith(".json"):
            raise FileNotFoundError(f"Incorrect JSON file extension for data file '{data_file_path}'.")
        # if "http://"  == data_file_path[:7].lower() or "https://" == data_file_path[:8].lower():
        if data_file_path.lower().startswith("http"):
            # Reference is to a URL.
            try:
-                response = requests.get(url=data_file_path)
+                response = requests.get(url=data_file_path, timeout=30)
            except Exception as error:
                raise FileNotFoundError(
                    f"Problem while attempting to access data file URL '{data_file_path}':  {error}"
@@ -166,7 +161,7 @@ class SitesInformation:
        if honor_exclusions:
            try:
-                response = requests.get(url=EXCLUSIONS_URL)
+                response = requests.get(url=EXCLUSIONS_URL, timeout=10)
                if response.status_code == 200:
                    exclusions = response.text.splitlines()
                    exclusions = [exclusion.strip() for exclusion in exclusions]
@@ -0,0 +1,47 @@
 """Tests for handling usernames with special/unicode characters."""
 from concurrent.futures import Future
 from sherlock_project.sherlock import get_response
 def _make_future_with_exception(exc):
    """Create a Future that raises the given exception."""
    future = Future()
    future.set_exception(exc)
    return future
 def test_get_response_handles_unicode_decode_error():
    """Regression test for issue #2730: UnicodeDecodeError must not crash.

    Usernames with special characters (e.g. 'Émile') can trigger a
    UnicodeDecodeError inside the requests library during redirect
    handling. get_response is expected to report it instead of raising.
    """
    decode_error = UnicodeDecodeError(
        "utf-8", b"\xe9", 0, 1, "invalid continuation byte"
    )
    failed_future = _make_future_with_exception(decode_error)

    outcome = get_response(
        request_future=failed_future,
        error_type=["status_code"],
        social_network="TestSite",
    )
    response, error_context, exception_text = outcome

    # Expect no response, the fixed context label, and the codec name in the text.
    assert response is None
    assert error_context == "Encoding Error"
    assert "utf-8" in exception_text
 def test_get_response_handles_unicode_encode_error():
    """UnicodeEncodeError should also be caught (subclass of UnicodeError)."""
    encode_error = UnicodeEncodeError(
        "ascii", "É", 0, 1, "ordinal not in range(128)"
    )
    failed_future = _make_future_with_exception(encode_error)

    outcome = get_response(
        request_future=failed_future,
        error_type=["status_code"],
        social_network="TestSite",
    )
    response, error_context, exception_text = outcome

    # Same expectations as the decode case, but with the "ascii" codec name.
    assert response is None
    assert error_context == "Encoding Error"
    assert "ascii" in exception_text
@@ -4,7 +4,7 @@ from sherlock_interactives import Interactives
 from sherlock_interactives import InteractivesSubprocessError
 def test_remove_nsfw(sites_obj):
-    nsfw_target: str = 'Pornhub'
+    nsfw_target: str = 'Xvideos'
    assert nsfw_target in {site.name: site.information for site in sites_obj}
    sites_obj.remove_nsfw_sites()
    assert nsfw_target not in {site.name: site.information for site in sites_obj}
@@ -12,8 +12,8 @@ def test_remove_nsfw(sites_obj):
 # Parametrized sites should *not* include Motherless, which is acting as the control
@pytest.mark.parametrize('nsfwsites', [
-    ['Pornhub'],
+    ['Xvideos'],
-    ['Pornhub', 'Xvideos'],
+    ['Xvideos', 'Erome'],
 ])
 def test_nsfw_explicit_selection(sites_obj, nsfwsites):
    for site in nsfwsites:
@@ -16,6 +16,7 @@ def set_pattern_upper_bound(pattern: str, upper_bound: int = FALSE_POSITIVE_QUAN
    """Set upper bound for regex patterns that use quantifiers such as `+` `*` or `{n,}`."""
    def replace_upper_bound(match: re.Match) -> str: # type: ignore
        lower_bound: int = int(match.group(1)) if match.group(1) else 0 # type: ignore
        nonlocal upper_bound
        upper_bound = upper_bound if lower_bound < upper_bound else lower_bound # type: ignore  # noqa: F823
        return f'{{{lower_bound},{upper_bound}}}'