From 746ab5961c023450f7774d568e052cee44946765 Mon Sep 17 00:00:00 2001 From: "Alex Xu (Hello71)" Date: Tue, 5 Oct 2021 15:22:21 -0400 Subject: get rid of web workers, refactor --- wfs.py | 259 ++++++++++++++++++++++++----------------------------------------- 1 file changed, 95 insertions(+), 164 deletions(-) diff --git a/wfs.py b/wfs.py index 08ce689..e67efbe 100755 --- a/wfs.py +++ b/wfs.py @@ -4,211 +4,142 @@ import argparse import logging import multiprocessing import os -import signal +import pathlib import sys -from itertools import chain -from multiprocessing import Pool -from multiprocessing.util import Finalize -from pathlib import Path -from urllib.parse import urlparse - from fontTools.subset import Options, Subsetter, load_font from selenium import webdriver -logging.basicConfig(format='[%(relativeCreated)d] %(message)s') -logger = logging.getLogger('websubset') +logging.basicConfig(format='%(levelname)s: %(message)s') +logger = logging.getLogger('wfs') logger.setLevel(logging.INFO) -EXTRACT_SCRIPT = r''' - let whitelist = new Set(arguments[0]); - let walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT); - let node, dict = {}; - while (node = walker.nextNode()) { - let cs = getComputedStyle(node.parentNode); - let css = k => cs.getPropertyValue(k); - if (css('display') == 'none') - continue; - let k = css('font-family').replace(/"/g, '').replace(/,.*/, '') + ';' + - css('font-weight') + ';' + - css('font-style'); - if (!whitelist.has(k)) - continue; - if (!(k in dict)) - dict[k] = ''; - dict[k] += node.nodeValue; - } - return dict; -''' - -def gen_font_face(font): - if 'fontfile' not in font: - return '' - return ''.join([ - '@font-face{', - 'font-family:"', font['family'], '";', - 'font-weight:', font['weight'], ';', - 'font-style:', font['style'], ';', - 'src: url("', font["fontfile"], '");', - '}']) - -DRIVER = None -def stop_driver(): - global DRIVER - if DRIVER: - DRIVER.quit() - DRIVER = None -def hook_sig(signum): - orig_handler = signal.getsignal(signum) - if orig_handler is None: - raise Exception('{signum} handler is None') - def term_handler(*_): - stop_driver() - signal.signal(signum, orig_handler) - os.kill(os.getpid(), signum) - signal.signal(signum, term_handler) -def start_wworker(driver_name): - # clamp selenium wasteful sleeps to 0.1s - import time - from time import sleep - time.sleep = lambda secs: sleep(min(secs, 0.1)) +def make_uri(path): + if ':' in path: + return path + return pathlib.Path(path).resolve().as_uri() - hook_sig(signal.SIGTERM) - global DRIVER +def start_driver(driver_name): if driver_name == 'chrome': chrome_options = webdriver.chrome.options.Options() chrome_options.headless = True chrome_options.experimental_options["prefs"] = { "profile.default_content_setting_values.images": 2 } - DRIVER = webdriver.Chrome(options=chrome_options, desired_capabilities={'detach': True}) - elif driver_name == 'firefox': + return webdriver.Chrome(options=chrome_options) + if driver_name == 'firefox': firefox_profile = webdriver.FirefoxProfile() firefox_profile.set_preference('permissions.default.image', 2) firefox_options = webdriver.firefox.options.Options() firefox_options.headless = True - DRIVER = webdriver.Firefox(firefox_profile=firefox_profile, options=firefox_options) - else: - raise Exception('unknown driver name') - Finalize(DRIVER, stop_driver, exitpriority=16) - -def is_uri(path): - parsed = urlparse(path) - return parsed.scheme and parsed.netloc -def make_uri(path): - if is_uri(path): - return path - else: - return Path(path).resolve().as_uri() - -def extract(path, fonts, screenshots): - logger.info('fetching %s', path) - DRIVER.get(make_uri(path)) - if screenshots: - logger.info('replacing fonts for %s', path) - height = DRIVER.execute_script(''.join([ - "let style = document.createElement('style'); style.innerHTML = '", - ''.join(gen_font_face(font) for font in fonts), - "'; document.body.appendChild(style); return document.documentElement.scrollHeight"])) - logger.info('taking pre-screenshot for %s', path) - DRIVER.set_window_size(1920, height) - screenshot = DRIVER.get_screenshot_as_png() - else: - screenshot = None - logger.info('extracting text from %s', path) - whitelist = [';'.join((f['family'], f['weight'], f['style'])) for f in fonts] - return (path, DRIVER.execute_script(EXTRACT_SCRIPT, whitelist), screenshot) + return webdriver.Firefox(firefox_profile=firefox_profile, options=firefox_options) + raise Exception('unknown driver name') -def get_fontdesc(fonts, fontspec): - font_match = dict(zip(('family', 'weight', 'style'), fontspec.split(';'))) - for font in fonts: - if font_match.items() <= font.items(): - return font - return None - -def subset(fontdesc, text, fts_opts): - fontfile = fontdesc['fontfile'] +def subset(fontfile, text, fts_opts): logger.info('subsetting %s', fontfile) font = load_font(fontfile, fts_opts, dontLoadGlyphNames=True) subsetter = Subsetter(options=fts_opts) subsetter.populate(text=text) subsetter.subset(font) - ret = [] - outfile = fontfile[:fontfile.rindex('.')] + '.subset.woff2' - ret.append((font, 'woff2', outfile)) - return ret - -def write_subset(font, flavor, outfile): + font.flavor = 'woff2' + outfile = fontfile[:fontfile.rindex('.')] + '.subset.' + font.flavor logger.info('writing %s', outfile) - font.flavor = flavor font.save(outfile) -def verify(path, screenshot_begin_png): - logger.info('refetching %s', path) - DRIVER.get(make_uri(path)) - - from io import BytesIO - from PIL import Image, ImageChops - screenshot_begin = Image.open(BytesIO(screenshot_begin_png), formats=('PNG',)).convert('RGB') - - logger.info('taking post-screenshot for %s', path) - DRIVER.set_window_size(*screenshot_begin.size) - screenshot_end_png = DRIVER.get_screenshot_as_png() - - screenshot_end = Image.open(BytesIO(screenshot_end_png), formats=('PNG',)).convert('RGB') - logger.info('checking %s screenshots for %s', 'x'.join(map(str, screenshot_end.size)), path) - if ImageChops.difference(screenshot_begin, screenshot_end).getbbox(): - raise Exception(f'screenshots do not match for {path}') - def main(argv): - parser = argparse.ArgumentParser(description='Web Font Subsetter', epilog='see pyftsubset --help for additional options') - parser.add_argument('--driver', help='selenium driver name (chrome or firefox)', default='chrome') + parser = argparse.ArgumentParser(description='Web Font Subsetter', + epilog='see pyftsubset --help for additional options') + parser.add_argument('--driver', + help='selenium driver name (chrome or firefox)', default='chrome') parser.add_argument('--no-screenshots', help='skip screenshot validation', action='store_true') parser.add_argument('--font', help='add font (fontfile:family:weight:style)', action='append') parser.add_argument('file', help='html files', nargs='+') args, leftover = parser.parse_known_intermixed_args(argv) options = Options() files = args.file + options.parse_opts(leftover) - if any([file[0] == '-' for file in files]): + if any(file[0] == '-' for file in files): parser.print_usage() raise Exception('bad arguments') - if options.with_zopfli: - from fontTools.ttLib import sfnt - sfnt.USE_ZOPFLI = True - ncpus = len(os.sched_getaffinity(0)) - fonts = [dict(zip(['fontfile', 'family', 'weight', 'style'], font.split(':'))) for font in args.font] - nwworkers = min(len(files), ncpus) - nfworkers = min(len(fonts), ncpus) - logger.info('using %d web workers, %d font workers', nwworkers, nfworkers) + if not args.no_screenshots: + from io import BytesIO + from PIL import Image, ImageChops + fonts = {} + for font in args.font: + fontlst = font.split(':') + fonts[(fontlst[1], fontlst[2] or '400', fontlst[3] or 'normal')] = fontlst[0] + + # clamp selenium wasteful sleeps + import time + sleep = time.sleep + time.sleep = lambda secs: sleep(min(secs, 0.1)) - with Pool(nfworkers) as fpool, \ - Pool(nwworkers, start_wworker, (args.driver,)) as wpool: - all_font_texts = {} + with start_driver(args.driver) as driver: + font_texts = {} screenshots = [] - extract_args = ((file, fonts, not args.no_screenshots) for file in args.file) - extracted = wpool.starmap(extract, extract_args) - for path, font_texts, screenshot in extracted: + for path in args.file: + logger.info('fetching %s', path) + driver.get(make_uri(path)) if not args.no_screenshots: - screenshots.append((path, screenshot)) - for fontspec, text in font_texts.items(): - if fontspec in all_font_texts: - all_font_texts[fontspec] |= set(text) + logger.info('replacing fonts for %s', path) + height = driver.execute_script(""" + let style = document.createElement('style'); + style.innerHTML = arguments[0]; + document.body.appendChild(style); + return document.documentElement.scrollHeight; + """, ''.join(f''' + @font-face{{ + font-family: "{fontdesc[0]}"; + font-weight: {fontdesc[1]}; + font-style: {fontdesc[2]}; + src: url({fontfile}); + }}''' for fontdesc, fontfile in fonts.items())) + logger.info('taking pre-screenshot for %s', path) + driver.set_window_size(1920, height) + screenshots.append((path, driver.get_screenshot_as_png())) + logger.info('extracting text from %s', path) + for fontstr, text in driver.execute_script(r''' + const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT); + let node, dict = {}; + while (node = walker.nextNode()) { + const cs = getComputedStyle(node.parentNode); + const css = k => cs.getPropertyValue(k); + if (css('display') == 'none') continue; + const k = css('font-family').replace(/"/g, '').replace(/,.*/, '') + ';' + + css('font-weight') + ';' + css('font-style'); + if (k in dict) dict[k] += node.nodeValue; + else dict[k] = node.nodeValue; + } + return dict; + ''').items(): + + fontspec = tuple(fontstr.split(';')) + if fontspec in font_texts: + font_texts[fontspec] |= set(text) else: - all_font_texts[fontspec] = set(text) + font_texts[fontspec] = set(text) if args.no_screenshots: - logger.info('shutting down web workers early') - wpool.close() - subset_args = ( - (get_fontdesc(fonts, fontspec), ''.join(text), options) - for fontspec, text in all_font_texts.items()) - subsetted = fpool.starmap(subset, subset_args) - fpool.starmap(write_subset, chain(*subsetted)) - if not args.no_screenshots: - wpool.starmap(verify, screenshots, 1) - if multiprocessing.active_children(): - logger.info('waiting for workers') - for proc in multiprocessing.active_children(): - proc.join() + logger.info('shutting down driver') + driver.close() + with multiprocessing.Pool(min(len(fonts), len(os.sched_getaffinity(0)))) as fpool: + jobs = [] + for fontspec, text in font_texts.items(): + try: + jobs.append((fonts[fontspec], ''.join(text), options)) + except KeyError: + logger.warning('missing font %s', fontspec) + fpool.starmap(subset, jobs) + while screenshots: + path, start_png = screenshots.pop() + start = Image.open(BytesIO(start_png), formats=('PNG',)) + logger.info('checking %dx%d screenshot for %s', *start.size, path) + driver.get(make_uri(path)) + + logger.info('taking post-screenshot for %s', path) + driver.set_window_size(*start.size) + + end = Image.open(BytesIO(driver.get_screenshot_as_png()), formats=('PNG',)) + if ImageChops.difference(start.convert('RGB'), end.convert('RGB')).getbbox(): + raise Exception(f'screenshots do not match for {path}') logger.info('exiting successfully') if __name__ == '__main__': -- cgit v1.2.3-70-g09d2