vr-shopxo-plugin/strip_page.py

206 lines
8.1 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
strip_page.py - HTML 脱脂脚本 v2
修复全面处理所有格式的 base64 图片无引号/双引号/单引号/url()/style属性
"""
import re
import os
import sys
from pathlib import Path
# An inline <style> block is moved to the external stylesheet only when its
# body is at least this many characters long; shorter blocks stay in the page.
STRIP_THRESHOLD_STYLE_CHARS = 50_000

# Short stand-in written wherever an embedded image payload is removed.  The
# base64 part decodes to just the 8-byte PNG signature, so it is a marker,
# not a displayable image.
BASE64_PLACEHOLDER = 'data:image/png;base64,iVBORw0KGgo='
def extract_css_variables(style_text):
    """Collect CSS custom-property declarations found in *style_text*.

    Args:
        style_text: Raw CSS, e.g. the body of a ``<style>`` element or the
            value of a ``style`` attribute.

    Returns:
        A list of strings of the form ``' --name: value;'`` (one leading
        space, any ``!important`` flag dropped), in order of appearance.
        Duplicates are kept; de-duplication is left to the caller.

    A declaration ends at ``;``, at the ``}`` closing its rule, or at the end
    of the text.  CSS lets the last declaration of a block omit the
    semicolon, so requiring ``;`` would silently lose it.

    Known limitation: a value that itself contains ``;`` (for instance an
    inline ``data:`` URL) is cut at that first semicolon.
    """
    declaration = re.compile(
        r'(--[\w-]+)\s*:\s*'        # property name
        r'([^;{}]+?)'               # value, never crossing a block boundary
        r'(?:\s*!important)?\s*'    # optional priority flag, not captured
        r'(?:;|\}|$)'               # explicit or implicit terminator
    )
    return [
        f' {m.group(1)}: {m.group(2).strip()};'
        for m in declaration.finditer(style_text)
    ]
# A CSS url() whose target is an embedded image, in any quoting style: double
# quotes, single quotes, HTML-entity quotes (the form used inside a
# double-quoted style="..." attribute) or no quotes at all.  Using a single
# alternation means every payload is visited exactly once, so a placeholder
# written for one notation can never be re-matched and re-counted by another.
_DATA_URL_RE = re.compile(
    r'url\(\s*(?:'
    r'"(?P<dq>data:image[^"]+)"'
    r"|'(?P<sq>data:image[^']+)'"
    r'|(?P<ent>&quot;|&#39;|&apos;)(?P<eq>data:image[^&"\']+)(?P=ent)'
    r'|(?P<nq>data:image[^\s)]+)'
    r')\s*\)',
    flags=re.IGNORECASE,
)

# <img ... src="data:image/..."> with either quote character.
_IMG_SRC_RE = re.compile(
    r'(?P<head><img[^>]*?src=)(?:'
    r'"(?P<dq>data:image/[^"]+)"'
    r"|'(?P<sq>data:image/[^']+)'"
    r')',
    flags=re.IGNORECASE,
)

# A complete <style>...</style> element; group 1 is the CSS body.
_STYLE_BLOCK_RE = re.compile(
    r'<style\b[^>]*>(.*?)</style>', flags=re.DOTALL | re.IGNORECASE
)


def _replace_data_images(html, stats):
    """Swap every embedded ``data:image`` payload in *html* for the placeholder.

    Increments ``stats['base64_replaced']`` and adds the length of each
    removed payload to ``stats['base64_saved_chars']``.  Returns the new
    markup.
    """

    def tally(payload):
        stats['base64_replaced'] += 1
        stats['base64_saved_chars'] += len(payload)

    def swap_css_url(m):
        if m.group('dq') is not None:
            payload, quote = m.group('dq'), '"'
        elif m.group('sq') is not None:
            payload, quote = m.group('sq'), "'"
        elif m.group('eq') is not None:
            payload, quote = m.group('eq'), m.group('ent')
        else:
            payload, quote = m.group('nq'), ''
        if payload == BASE64_PLACEHOLDER:
            # Already stripped (e.g. the script is re-run on its own output).
            return m.group(0)
        tally(payload)
        # Keep the author's quoting style.  In particular an unquoted url()
        # stays unquoted: adding '"' would terminate a surrounding
        # style="..." attribute early.
        return f'url({quote}{BASE64_PLACEHOLDER}{quote})'

    def swap_img_src(m):
        if m.group('dq') is not None:
            payload, quote = m.group('dq'), '"'
        else:
            payload, quote = m.group('sq'), "'"
        if payload == BASE64_PLACEHOLDER:
            return m.group(0)
        tally(payload)
        return f'{m.group("head")}{quote}{BASE64_PLACEHOLDER}{quote}'

    html = _DATA_URL_RE.sub(swap_css_url, html)
    return _IMG_SRC_RE.sub(swap_img_src, html)


def _externalise_large_styles(html, stem, stats):
    """Pull oversized ``<style>`` bodies out of *html*.

    Blocks shorter than ``STRIP_THRESHOLD_STYLE_CHARS`` are left untouched.
    Each larger block is replaced by a marker comment; the first one is also
    replaced by the ``<link>`` to ``{stem}.styles.css`` so the page always
    references the extracted rules.

    Returns ``(new_html, [(style_id, css_body), ...])``.
    """
    extracted = []

    def move_out(m):
        body = m.group(1)
        if len(body) < STRIP_THRESHOLD_STYLE_CHARS:
            return m.group(0)
        stats['styles_extracted'] += 1
        stats['styles_saved_chars'] += len(body)
        style_id = f'extracted-style-{stats["styles_extracted"]}'
        extracted.append((style_id, body))
        marker = f'<!-- {style_id} moved to external CSS -->'
        if len(extracted) == 1:
            # All extracted blocks share one file, so one link is enough.
            link = f'<link rel="stylesheet" id="{style_id}" href="{stem}.styles.css">'
            return link + marker
        return marker

    return _STYLE_BLOCK_RE.sub(move_out, html), extracted


def _unique_variables(declarations):
    """Keep the first declaration seen for each custom-property name."""
    first_seen = {}
    for decl in declarations:
        name = decl.strip().split(':')[0]
        if name.startswith('--'):
            first_seen.setdefault(name, decl)
    return list(first_seen.values())


def strip_html(input_path, output_dir=None):
    """Write a slimmed-down copy of an HTML file and print a summary report.

    Processing steps, in order:

    0. Collect CSS custom properties from every ``<style>`` block and every
       double-quoted ``style`` attribute (done before anything is replaced).
    1. Replace embedded ``data:image`` payloads (CSS ``url()`` in any quoting
       style, and ``<img src>``) with ``BASE64_PLACEHOLDER``.
    2. Move ``<style>`` blocks of at least ``STRIP_THRESHOLD_STYLE_CHARS``
       characters into ``<stem>.styles.css`` and link that file from the page.
    3. Write the collected variables to ``<stem>.variables.css``.
    4. Remove all ``<script>`` elements.
    5. Point ``href="data:..."`` attributes at ``#data-link-removed``.
    6. Drop empty HTML comments and write ``<stem>.stripped.html``.

    Args:
        input_path: Path of the HTML file to process (str or ``Path``).
        output_dir: Directory for the output files; created if missing.
            Defaults to the directory containing *input_path*.

    Returns:
        The statistics dict that is also shown in the printed report
        (replacement counts and characters removed).

    Raises:
        OSError: if the input cannot be read or an output cannot be written.
    """
    input_path = Path(input_path)
    output_dir = input_path.parent if output_dir is None else Path(output_dir)

    # Read first so a missing input does not leave an empty output directory.
    html = input_path.read_text(encoding='utf-8', errors='replace')
    output_dir.mkdir(parents=True, exist_ok=True)

    stem = input_path.stem
    html_out = output_dir / f'{stem}.stripped.html'
    css_out = output_dir / f'{stem}.styles.css'
    vars_out = output_dir / f'{stem}.variables.css'

    original_size = len(html)
    stats = {
        'base64_replaced': 0,
        'base64_saved_chars': 0,
        'styles_extracted': 0,
        'styles_saved_chars': 0,
        'css_vars_extracted': 0,
        'scripts_removed': 0,
        'data_hrefs_fixed': 0,
    }

    # -- 0. Scan for CSS variables (before any replacement) ---------------
    all_vars = []
    for m in _STYLE_BLOCK_RE.finditer(html):
        all_vars.extend(extract_css_variables(m.group(1)))
    for m in re.finditer(r'style="([^"]+)"', html):
        all_vars.extend(extract_css_variables(m.group(1)))

    # -- 1. Replace every embedded base64 image ---------------------------
    html = _replace_data_images(html, stats)

    # -- 2. Move very large inline <style> blocks out of the page ---------
    html, extracted_styles = _externalise_large_styles(html, stem, stats)

    # -- 3. Write the external stylesheet ---------------------------------
    # Sizes are tracked from what this run writes (in characters, like the
    # HTML sizes), so a stale file left by an earlier run is never reported.
    css_size = 0
    if extracted_styles:
        css_parts = ['/* === Extracted inline styles === */', '']
        for style_id, content in extracted_styles:
            css_parts.append(f'/* --- {style_id} --- */')
            css_parts.append(content)
            css_parts.append('')
        css_text = '\n'.join(css_parts)
        css_out.write_text(css_text, encoding='utf-8')
        css_size = len(css_text)

    # -- 4. Write the variables file --------------------------------------
    vars_size = 0
    unique_vars = _unique_variables(all_vars)
    if unique_vars:
        vars_text = (
            '/* === Extracted CSS Variables === */\n:root {\n'
            + '\n'.join(unique_vars)
            + '\n}\n'
        )
        vars_out.write_text(vars_text, encoding='utf-8')
        vars_size = len(vars_text)
        stats['css_vars_extracted'] = len(unique_vars)

    # -- 5. Remove scripts ------------------------------------------------
    # Paired tags first: a "<script .../>" is treated as an opening tag when
    # a "</script>" follows, so only truly unclosed ones reach the 2nd pass.
    html, paired = re.subn(
        r'<script\b[^>]*>.*?</script>', '', html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    html, lone = re.subn(r'<script\b[^>]*/>\s*', '', html, flags=re.IGNORECASE)
    stats['scripts_removed'] = paired + lone

    # -- 6. Neutralise data: hrefs ----------------------------------------
    html, stats['data_hrefs_fixed'] = re.subn(
        r'href="(data:[^"]+)"', 'href="#data-link-removed"', html
    )

    # -- 7. Write the stripped HTML ---------------------------------------
    html = re.sub(r'<!--\s*-->', '', html)
    html_out.write_text(html, encoding='utf-8')

    new_html_size = len(html)
    saved = original_size - new_html_size - css_size
    # An empty input would otherwise divide by zero.
    saved_pct = saved / original_size * 100 if original_size else 0.0

    rule = '=' * 54
    report = [
        '',
        '脱脂完成',
        rule,
        f'📄 输入: {input_path.name}',
        f'原始大小: {original_size/1024:.1f} KB ({original_size:,} 字符)',
        rule,
        '📊 处理结果:',
        f'base64 图片替换: {stats["base64_replaced"]}',
        f'base64 节省: {stats["base64_saved_chars"]/1024:.1f} KB',
        f'style 块提取: {stats["styles_extracted"]}',
        f'style 节省: {stats["styles_saved_chars"]/1024:.1f} KB',
        f'CSS 变量提取: {stats["css_vars_extracted"]}',
        f'script 移除: {stats["scripts_removed"]}',
        f'data href 修复: {stats["data_hrefs_fixed"]}',
        rule,
        '📦 输出文件:',
        f'HTML (脱脂): {html_out.name} ({new_html_size/1024:.1f} KB)',
        f'CSS (提取样式): {css_out.name} ({css_size/1024:.1f} KB)',
        f'CSS (变量): {vars_out.name} ({vars_size/1024:.1f} KB)',
        rule,
        f'💾 HTML 体积压缩: {new_html_size/1024:.1f} KB (原始 {original_size/1024:.1f} KB)',
        f'节省比例: {saved_pct:.1f}%',
        '',
    ]
    print('\n'.join(report))
    return stats
if __name__ == '__main__':
    # Command-line entry point: <input.html> is required, [output_dir] is
    # optional and falls back to strip_html's own default when omitted.
    cli_args = sys.argv[1:]
    if not cli_args:
        print('用法: python3 strip_page.py <input.html> [output_dir]')
        sys.exit(1)
    target_dir = cli_args[1] if len(cli_args) > 1 else None
    strip_html(cli_args[0], target_dir)