415gotit

2026-02-21 18:27:49 +08:00
parent 0ac4b23f07
commit 5dc86ccfbf
270 changed files with 49508 additions and 4636 deletions
--- a/analyze_priority.py
+++ b/analyze_priority.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""
+从 chatgpt.com-*_analysis.json 中，按优先级对每个指纹字段评分排序。
+
+评分规则：
+  +10  bot 自动化检测专属字段（webdriver, $cdc_*, callPhantom 等）
+  + 5  出现在核心检测循环 tH=154 或 tH=155
+  + 2  每额外出现在一个不同 tH（跨 tH 频次）
+  + 3  属于已知高风险 API（Crypto, RTCPeerConnection, OfflineAudioContext 等）
+  + 1  属于 navigator / screen / canvas 系列
+"""
+
+import json
+import sys
+import glob
+from collections import defaultdict
+
+# ── Configuration ────────────────────────────────────────────
+# Default input file: the lexicographically last file matching the glob
+# (max() over the matches is the same element as sorted(...)[-1]).
+# When nothing matches, fall back to the unexpanded pattern instead of
+# raising IndexError at import time: a path given on the command line then
+# still works, and opening the default fails with an error naming the pattern.
+_ANALYSIS_GLOB = "/home/carry/myprj/hcaptcha/asset/chatgpt.com-*_analysis.json"
+ANALYSIS_JSON = max(glob.glob(_ANALYSIS_GLOB), default=_ANALYSIS_GLOB)
+
+# Field names that only exist to detect browser automation; score() adds
+# +10 to any name that matches one of these exactly.
+BOT_SIGNALS = {
+    "webdriver",
+    "callPhantom",
+    "callSelenium",
+    "_selenium",
+    "__phantomas",
+    "domAutomationController",
+    "awesomium",
+    "$wdc_",
+    "domAutomation",
+    "_WEBDRIVER_ELEM_CACHE",
+    "spawn",
+    "__nightmare",
+    "__webdriver_script_fn",
+    "__webdriver_script_func",
+    "__driver_evaluate",
+    "__webdriver_evaluate",
+    "__selenium_evaluate",
+    "__fxdriver_evaluate",
+    "__driver_unwrapped",
+    "__webdriver_unwrapped",
+    "__selenium_unwrapped",
+    "__fxdriver_unwrapped",
+    "hcaptchaCallbackZenno",
+    "_Selenium_IDE_Recorder",
+    "cdc_adoQpoasnfa76pfcZLmcfl_Array",
+    "cdc_adoQpoasnfa76pfcZLmcfl_Promise",
+    "cdc_adoQpoasnfa76pfcZLmcfl_Symbol",
+    "CDCJStestRunStatus",
+    "$cdc_asdjflasutopfhvcZLmcfl_",
+    "$chrome_asyncScriptInfo",
+}
+
+# High-risk APIs (strong fingerprinting power); score() adds +3 for these.
+HIGH_RISK_APIS = {
+    "Crypto",
+    "RTCPeerConnection",
+    "OfflineAudioContext",
+    "CanvasRenderingContext2D",
+    "HTMLCanvasElement",
+    "WebGL2RenderingContext",
+    "WebGLRenderingContext",
+    "IDBFactory",
+    "PluginArray",
+    "NavigatorUAData",
+    "PerformanceNavigationTiming",
+    "PerformanceResourceTiming",
+}
+
+# navigator / screen / canvas family; score() adds +1 for these.
+MEDIUM_APIS = {
+    "Navigator",
+    "Screen",
+    "Storage",
+    "Performance",
+    "HTMLDocument",
+    "ScreenOrientation",
+    "NetworkInformation",
+    "languages",
+    "maxTouchPoints",
+    "webdriver",
+    "platform",
+    "userAgent",
+}
+
+# tH ids of the core detection loop; appearing in either one adds +5.
+CORE_TH = {154, 155}
+
+
+# ── 加载 ────────────────────────────────────────────────────
+def load(path):
+    """Read the UTF-8 encoded JSON file at ``path`` and return the parsed value."""
+    with open(path, encoding="utf-8") as handle:
+        parsed = json.load(handle)
+    return parsed
+
+
+# ── 评分 ────────────────────────────────────────────────────
+def score(data):
+    """Collect candidate field names from ``data`` and score each one.
+
+    ``data`` maps tH ids (keys passed through ``int()``) to entries whose
+    optional ``"ig_values"`` list holds the observed values.  Non-string
+    values, and strings that look like data rather than a name (known value
+    prefixes or longer than 80 characters), are ignored.
+
+    Returns a ``defaultdict`` keyed by name, in first-seen order, whose
+    values are dicts with ``"tH_set"`` (tH ids the name occurred in),
+    ``"score"`` (int) and ``"reasons"`` (one label per rule that applied).
+    """
+    value_prefixes = ("0,1,2", "1:", "#", "return ", "https://")
+    api_info = defaultdict(lambda: {"tH_set": set(), "score": 0, "reasons": []})
+
+    # Pass 1: remember in which tH each candidate name was seen.
+    for key, entry in data.items():
+        th_id = int(key)
+        for value in entry.get("ig_values", []):
+            looks_like_name = (
+                isinstance(value, str)
+                and len(value) <= 80
+                and not value.startswith(value_prefixes)
+            )
+            if looks_like_name:
+                api_info[value]["tH_set"].add(th_id)
+
+    # Pass 2: evaluate the scoring rules in a fixed order so the reason
+    # labels always come out in the same sequence.
+    for name, record in api_info.items():
+        seen_in = record["tH_set"]
+        core_hit = seen_in & CORE_TH
+        freq = len(seen_in)
+        bonus = (freq - 1) * 2  # +2 for every tH beyond the first
+        rules = [
+            (name in BOT_SIGNALS, 10, "🚨 bot检测字段 +10"),
+            (bool(core_hit), 5, f"🎯 核心循环 tH={sorted(core_hit)} +5"),
+            (name in HIGH_RISK_APIS, 3, "⚡ 高风险API +3"),
+            (name in MEDIUM_APIS, 1, "📡 navigator/screen类 +1"),
+            (freq > 1, bonus, f"🔁 跨{freq}个tH +{bonus}"),
+        ]
+        applied = [(points, label) for matched, points, label in rules if matched]
+        record["score"] = sum(points for points, _ in applied)
+        record["reasons"] = [label for _, label in applied]
+
+    return api_info
+
+
+# ── 输出 ─────────────────────────────────────────────────────
+def _priority_path(analysis_path):
+    """Return the ``*_priority.json`` path that belongs to ``analysis_path``."""
+    for suffix in ("_analysis.json", ".json"):
+        if analysis_path.endswith(suffix):
+            return analysis_path[: -len(suffix)] + "_priority.json"
+    # Unrecognised name: append rather than substitute, so the result can
+    # never be written over the input file itself.
+    return analysis_path + "_priority.json"
+
+
+def report(api_info, out_path=None):
+    """Print the ranking grouped by priority tier and export it as JSON.
+
+    Args:
+        api_info: mapping name -> {"score", "tH_set", "reasons"}.
+        out_path: file to write the JSON export to.  When omitted, the path
+            is derived from the module-level ``ANALYSIS_JSON`` default by
+            replacing ``_analysis.json`` with ``_priority.json``.
+
+    The export maps each name (highest score first, ties in input order) to
+    its score, the sorted list of tH ids and the list of reasons.
+    """
+    # Highest score first; sorted() is stable, so ties keep their input order.
+    ranked = sorted(api_info.items(), key=lambda item: -item[1]["score"])
+
+    print("=" * 70)
+    print("  HSW 指纹字段 优先级排名")
+    print("=" * 70)
+
+    # Tier label plus the predicate deciding whether a score belongs to it.
+    tiers = [
+        ("🔴 P0  必须正确（≥10分）", lambda s: s >= 10),
+        ("🟠 P1  高优先级（5~9分）",  lambda s: 5 <= s < 10),
+        ("🟡 P2  中优先级（3~4分）",  lambda s: 3 <= s < 5),
+        ("🟢 P3  低优先级（1~2分）",  lambda s: 1 <= s < 3),
+        ("⚪ P4  可忽略（0分）",       lambda s: s == 0),
+    ]
+
+    for tier_label, condition in tiers:
+        tier_items = [(api, info) for api, info in ranked if condition(info["score"])]
+        if not tier_items:
+            continue  # empty tiers are not printed at all
+        print(f"\n{tier_label}  [{len(tier_items)} 个]")
+        print(f"  {'分数':<5}  {'字段名':<45}  出现tH")
+        print(f"  {'─'*5}  {'─'*45}  {'─'*20}")
+        for api, info in tier_items:
+            tH_list = ",".join(str(t) for t in sorted(info["tH_set"]))
+            print(f"  {info['score']:<5}  {api:<45}  tH={tH_list}")
+            for r in info["reasons"]:
+                print(f"          {r}")
+
+    # JSON export; sets are not JSON-serialisable, so tH_set becomes a sorted list.
+    out = {
+        api: {
+            "score": info["score"],
+            "tH_list": sorted(info["tH_set"]),
+            "reasons": info["reasons"],
+        }
+        for api, info in ranked
+    }
+    if out_path is None:
+        out_path = ANALYSIS_JSON.replace("_analysis.json", "_priority.json")
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(out, f, ensure_ascii=False, indent=2)
+    print(f"\n📄 优先级结果已写入: {out_path}")
+
+
+# ── Entry point ──────────────────────────────────────────────
+if __name__ == "__main__":
+    path = sys.argv[1] if len(sys.argv) > 1 else ANALYSIS_JSON
+    print(f"📂 读取: {path}\n")
+    data = load(path)
+    api_info = score(data)
+    # Write the result next to the file that was actually analysed, not next
+    # to the default one, so a command-line path never clobbers another
+    # run's output.
+    report(api_info, out_path=_priority_path(path))