# hcapEnv/analyze_priority.py

#!/usr/bin/env python3
"""
从 chatgpt.com-*_analysis.json 中，按优先级对每个指纹字段评分排序。

评分规则：
  +10  bot 自动化检测专属字段（webdriver, $cdc_*, callPhantom 等）
  + 5  出现在核心检测循环 tH=154 或 tH=155
  + 2  每额外出现在一个不同 tH（跨 tH 频次）
  + 3  属于已知高风险 API（Crypto, RTCPeerConnection, OfflineAudioContext 等）
  + 1  属于 navigator / screen / canvas 系列
"""

import json
import sys
import glob
from collections import defaultdict

# ── 配置 ─────────────────────────────────────────────────────
# Default input file: the lexicographically greatest path matching the pattern
# below, or None when nothing matches. Falling back to None (instead of
# indexing an empty list) keeps the module importable on machines that do not
# have this directory, so an explicit path can still be used.
ANALYSIS_JSON = max(
    glob.glob("/home/carry/myprj/hcaptcha/asset/chatgpt.com-*_analysis.json"),
    default=None,
)

# Field names treated as bot-automation giveaways. score() adds +10 when a
# field name matches one of these entries exactly.
BOT_SIGNALS = {
    "webdriver",
    "callPhantom",
    "callSelenium",
    "_selenium",
    "__phantomas",
    "domAutomationController",
    "awesomium",
    "$wdc_",
    "domAutomation",
    "_WEBDRIVER_ELEM_CACHE",
    "spawn",
    "__nightmare",
    "__webdriver_script_fn",
    "__webdriver_script_func",
    "__driver_evaluate",
    "__webdriver_evaluate",
    "__selenium_evaluate",
    "__fxdriver_evaluate",
    "__driver_unwrapped",
    "__webdriver_unwrapped",
    "__selenium_unwrapped",
    "__fxdriver_unwrapped",
    "hcaptchaCallbackZenno",
    "_Selenium_IDE_Recorder",
    "cdc_adoQpoasnfa76pfcZLmcfl_Array",
    "cdc_adoQpoasnfa76pfcZLmcfl_Promise",
    "cdc_adoQpoasnfa76pfcZLmcfl_Symbol",
    "CDCJStestRunStatus",
    "$cdc_asdjflasutopfhvcZLmcfl_",
    "$chrome_asyncScriptInfo",
}

# High-risk APIs (strong fingerprinting surface). score() adds +3 when a field
# name matches one of these entries exactly.
HIGH_RISK_APIS = {
    "Crypto",
    "RTCPeerConnection",
    "OfflineAudioContext",
    "CanvasRenderingContext2D",
    "HTMLCanvasElement",
    "WebGL2RenderingContext",
    "WebGLRenderingContext",
    "IDBFactory",
    "PluginArray",
    "NavigatorUAData",
    "PerformanceNavigationTiming",
    "PerformanceResourceTiming",
}

# Medium-weight names (navigator / screen family). score() adds +1 when a
# field name matches one of these entries exactly. "webdriver" is listed in
# BOT_SIGNALS as well, so it collects both bonuses.
MEDIUM_APIS = {
    "Navigator",
    "Screen",
    "Storage",
    "Performance",
    "HTMLDocument",
    "ScreenOrientation",
    "NetworkInformation",
    "languages",
    "maxTouchPoints",
    "webdriver",
    "platform",
    "userAgent",
}

# tH values of the core detection loop; score() adds +5 to any field that was
# seen under at least one of them.
CORE_TH = {154, 155}


# ── 加载 ────────────────────────────────────────────────────
def load(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, encoding="utf-8") as handle:
        text = handle.read()
    return json.loads(text)


# ── 评分 ────────────────────────────────────────────────────
def score(data):
    """Collect field names per tH and assign each one a priority score.

    Args:
        data: mapping whose keys are tH identifiers convertible with ``int``
            and whose values are mappings that may carry an ``"ig_values"``
            list. Only string items of that list are considered.

    Returns:
        A ``defaultdict`` mapping each retained field name to a dict with
        ``"tH_set"`` (set of int tH values it appeared under), ``"score"``
        (int) and ``"reasons"`` (list of human-readable scoring notes).
    """
    # Strings starting with one of these are treated as captured values,
    # not API names, and are ignored (as is anything longer than 80 chars).
    value_prefixes = ("0,1,2", "1:", "#", "return ", "https://")

    api_info = defaultdict(lambda: {"tH_set": set(), "score": 0, "reasons": []})

    # Pass 1: record under which tH values every candidate name shows up.
    for key, entry in data.items():
        loop_id = int(key)
        for candidate in entry.get("ig_values", []):
            if not isinstance(candidate, str):
                continue
            if candidate.startswith(value_prefixes) or len(candidate) > 80:
                continue
            api_info[candidate]["tH_set"].add(loop_id)

    # Pass 2: apply the scoring rules in a fixed order so the reasons list
    # always reads bot signal -> core loop -> high risk -> medium -> frequency.
    for name, record in api_info.items():
        seen = record["tH_set"]
        core_hit = seen & CORE_TH
        freq = len(seen)
        bonus = (freq - 1) * 2  # +2 for every tH beyond the first

        rules = [
            (name in BOT_SIGNALS, 10, "🚨 bot检测字段 +10"),
            (bool(core_hit), 5, f"🎯 核心循环 tH={sorted(core_hit)} +5"),
            (name in HIGH_RISK_APIS, 3, "⚡ 高风险API +3"),
            (name in MEDIUM_APIS, 1, "📡 navigator/screen类 +1"),
            (freq > 1, bonus, f"🔁 跨{freq}个tH +{bonus}"),
        ]
        matched = [(points, note) for hit, points, note in rules if hit]

        record["score"] = sum(points for points, _ in matched)
        record["reasons"] = [note for _, note in matched]

    return api_info


# ── 输出 ─────────────────────────────────────────────────────
def _priority_path(source_path):
    """Return the path of the priority JSON that belongs to ``source_path``."""
    import os  # local import: the module does not import os at file level

    source_path = os.fspath(source_path)
    suffix = "_analysis.json"
    if source_path.endswith(suffix):
        return source_path[: -len(suffix)] + "_priority.json"
    # Unconventional file name: build a sibling path from the stem so the
    # export can never land on (and overwrite) the input file itself.
    return os.path.splitext(source_path)[0] + "_priority.json"


def report(api_info, source_path=None):
    """Print the tiered ranking and export it as JSON next to the input file.

    Args:
        api_info: mapping of field name to a dict holding ``"score"`` (int),
            ``"tH_set"`` (set of int) and ``"reasons"`` (list of str), as
            returned by ``score``.
        source_path: path of the analysis file the scores were computed
            from. The export is written beside it, with ``_analysis.json``
            replaced by ``_priority.json``. Defaults to ``ANALYSIS_JSON``.

    Raises:
        OSError: if the output file cannot be written.
    """
    if source_path is None:
        source_path = ANALYSIS_JSON

    # Highest score first; ties keep their original insertion order.
    ranked = sorted(api_info.items(), key=lambda x: -x[1]["score"])

    print("=" * 70)
    print("  HSW 指纹字段 优先级排名")
    print("=" * 70)

    # Score tiers, each a (label, predicate) pair; together they cover every
    # non-negative integer score exactly once.
    tiers = [
        ("🔴 P0  必须正确（≥10分）", lambda s: s >= 10),
        ("🟠 P1  高优先级（5~9分）", lambda s: 5 <= s < 10),
        ("🟡 P2  中优先级（3~4分）", lambda s: 3 <= s < 5),
        ("🟢 P3  低优先级（1~2分）", lambda s: 1 <= s < 3),
        ("⚪ P4  可忽略（0分）", lambda s: s == 0),
    ]

    for tier_label, condition in tiers:
        tier_items = [(api, info) for api, info in ranked if condition(info["score"])]
        if not tier_items:
            continue  # do not print headers for empty tiers
        print(f"\n{tier_label}  [{len(tier_items)} 个]")
        print(f"  {'分数':<5}  {'字段名':<45}  出现tH")
        print(f"  {'─'*5}  {'─'*45}  {'─'*20}")
        for api, info in tier_items:
            tH_list = ",".join(str(t) for t in sorted(info["tH_set"]))
            print(f"  {info['score']:<5}  {api:<45}  tH={tH_list}")
            for r in info["reasons"]:
                print(f"          {r}")

    # Export as JSON; sets are converted to sorted lists to be serialisable.
    out = {
        api: {
            "score": info["score"],
            "tH_list": sorted(info["tH_set"]),
            "reasons": info["reasons"],
        }
        for api, info in ranked
    }
    # Derive the output from the file that was actually analysed, not from
    # the module default, so a path given on the command line is respected.
    out_path = _priority_path(source_path)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)
    print(f"\n📄 优先级结果已写入: {out_path}")


# ── Entry point ──────────────────────────────────────────────
if __name__ == "__main__":
    path = sys.argv[1] if len(sys.argv) > 1 else ANALYSIS_JSON
    if not path:
        # Nothing to open: stop with a clear message instead of a traceback.
        sys.exit("❌ 未指定分析文件路径，请通过命令行参数传入")
    print(f"📂 读取: {path}\n")
    data = load(path)
    api_info = score(data)
    report(api_info, path)