shaka-packager/tools/site_compare/commands/maskmaker.py

273 lines
10 KiB
Python

# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Component for automatically creating masks of changing areas of a website.
Works by repeated invocation of a browser and scraping of the resulting page.
Areas that differ will be added to the auto-generated mask. The mask generator
considers the mask complete when further scrapes fail to produce any differences
in the mask.
"""
import os # Functions for walking the directory tree
import tempfile # Get a temporary directory to hold intermediates
import time # Used for sleep() and naming masks by time
import command_line
import drivers
from PIL import Image
from PIL import ImageChops
import scrapers
def CreateCommand(cmdline):
  """Inserts the command and arguments into a command line for parsing.

  Registers the "maskmaker" command on cmdline, with ValidateMaskmaker as its
  validator and ExecuteMaskmaker as its executor, then declares every
  argument the command accepts together with the dependency, required-group
  and mutual-exclusion rules between them.

  Args:
    cmdline: the command-line object to extend. It must provide AddCommand();
      the object AddCommand() returns must provide AddArgument(),
      AddMutualExclusion(), AddDependency() and AddRequiredGroup().
  """
  cmd = cmdline.AddCommand(
      ["maskmaker"],
      "Automatically generates a mask from a list of URLs",
      ValidateMaskmaker,
      ExecuteMaskmaker)

  # Which browser to drive.
  cmd.AddArgument(
      ["-bp", "--browserpath"], "Full path to browser's executable",
      type="readfile", metaname="PATH")
  cmd.AddArgument(
      ["-b", "--browser"], "Which browser to use", type="string",
      default="chrome")
  cmd.AddArgument(
      ["-bv", "--browserver"], "Version of the browser", metaname="VERSION")

  # Where the generated masks go.
  cmd.AddArgument(
      ["-o", "--outdir"], "Directory to store generated masks", metaname="DIR",
      required=True)

  # Which URL(s) to process: a single URL, or a (slice of a) list file.
  cmd.AddArgument(
      ["-u", "--url"], "URL to compare")
  cmd.AddArgument(
      ["-l", "--list"], "List of URLs to compare", type="readfile")
  cmd.AddMutualExclusion(["--url", "--list"])
  cmd.AddArgument(
      ["-s", "--startline"], "First line of URL list", type="int")
  cmd.AddArgument(
      ["-e", "--endline"], "Last line of URL list (exclusive)", type="int")
  cmd.AddArgument(
      ["-c", "--count"], "Number of lines of URL file to use", type="int")
  cmd.AddDependency("--startline", "--list")
  cmd.AddRequiredGroup(["--url", "--list"])
  cmd.AddDependency("--endline", "--list")
  cmd.AddDependency("--count", "--list")
  cmd.AddMutualExclusion(["--count", "--endline"])
  cmd.AddDependency("--count", "--startline")

  # Scraping behaviour and termination criteria.
  cmd.AddArgument(
      ["-t", "--timeout"], "Amount of time (seconds) to wait for browser to "
      "finish loading",
      type="int", default=60)
  cmd.AddArgument(
      ["-w", "--wait"],
      "Amount of time (in seconds) to wait between successive scrapes",
      type="int", default=60)
  cmd.AddArgument(
      ["-sc", "--scrapes"],
      "Number of successive scrapes which must result in no change to a mask "
      "before mask creation is considered complete", type="int", default=10)
  cmd.AddArgument(
      ["-sz", "--size"], "Browser window size", default=(800, 600),
      type="coords")
  cmd.AddArgument(["-sd", "--scrapedir"], "Directory to store scrapes")
  cmd.AddArgument(
      ["-gu", "--giveup"],
      "Number of times to scrape before giving up", type="int", default=50)
  # The two help-string fragments are concatenated by the parser, so the
  # first one must end with a space ("will be discarded", not "bediscarded").
  cmd.AddArgument(
      ["-th", "--threshhold"],
      "Percentage of different pixels (0-100) above which the scrape will be "
      "discarded and the mask not updated.", type="int", default=100)
  cmd.AddArgument(
      ["--er", "--errors"],
      "Number of times a scrape can fail before giving up on the URL.",
      type="int", default=1)
def ValidateMaskmaker(command):
  """Checks the maskmaker arguments; raises ParseError when they are invalid.

  The only check performed: if a value was given for --browserpath, its file
  extension (compared case-insensitively) must be .exe, .com or .bat.

  Args:
    command: object indexable by argument name (reads "--browserpath").

  Raises:
    command_line.ParseError: the browser path does not name an executable.
  """
  browser_path = command["--browserpath"]
  if not browser_path:
    # No explicit browser path was supplied, so there is nothing to validate.
    return

  extension = os.path.splitext(browser_path)[1].lower()
  if extension in [".exe", ".com", ".bat"]:
    return

  raise command_line.ParseError("Browser filename must be an executable")
def _LoadOrCreateMask(mask_filename, size):
  """Opens the mask image at mask_filename, creating it if it does not exist.

  If an existing mask has a different size than requested, a size-specific
  filename is derived from mask_filename and the lookup is retried with it.

  Args:
    mask_filename: path of the mask image to load.
    size: the (width, height) the mask must have.

  Returns:
    A (mask, mask_filename) tuple. mask_filename is the path actually used,
    which differs from the argument when a size-specific name was needed.
  """
  # This is a loop so we can try to recover from error conditions: a missing
  # mask is created and then re-opened, a wrongly sized one is side-stepped
  # by switching to a different filename.
  while True:
    try:
      mask = Image.open(mask_filename)
    except IOError:
      print(" %r does not exist, creating" % mask_filename)
      # A freshly created mask is entirely 1 (white), i.e. masks nothing.
      mask = Image.new("1", size, 1)
      mask.save(mask_filename)
      continue

    if mask.size == size:
      return mask, mask_filename

    print(" %r already exists and is the wrong size! (%r vs %r)" % (
        mask_filename, mask.size, size))
    # Insert the size between the base name and the 4-character extension.
    mask_filename = "%s_%r%s" % (
        mask_filename[:-4], size, mask_filename[-4:])
    print(" Trying again as %r..." % mask_filename)


def _LoadBaseline(mask_scrape_dir):
  """Returns the baseline scrape stored in mask_scrape_dir, or None.

  The baseline is the entry of the directory whose name sorts first. Scrapes
  are saved under timestamp-based names, so this is presumably the earliest
  scrape taken.

  Args:
    mask_scrape_dir: existing directory holding the scrapes for one mask.
  """
  mask_scrapes = sorted(os.listdir(mask_scrape_dir))
  if not mask_scrapes:
    print(" No baseline image found, mask will not be updated")
    return None
  return Image.open(os.path.join(mask_scrape_dir, mask_scrapes[0]))


def ExecuteMaskmaker(command):
  """Performs automatic mask generation.

  Repeatedly scrapes every requested URL and compares each new scrape with
  that URL's baseline scrape. Pixels that differ are blacked out in the URL's
  mask. A URL is finished once --scrapes consecutive scrapes produced no
  difference outside the mask, or abandoned once it accumulated --errors
  scrape failures. Processing stops when no URLs remain or after --giveup
  passes, and a summary report is printed.

  All output uses the single-argument parenthesised print form, which behaves
  the same as a print statement and as the print() function.

  Args:
    command: object indexable by argument name, holding the parsed values of
      the arguments registered for the "maskmaker" command.
  """

  class MaskmakerURL(object):
    """Helper class for holding information about a URL passed to maskmaker."""
    __slots__ = ['url', 'consecutive_successes', 'errors']

    def __init__(self, url):
      self.url = url
      self.consecutive_successes = 0
      self.errors = 0

  # Get the list of URLs to generate masks for
  if command["--url"]:
    url_list = [MaskmakerURL(command["--url"])]
  else:
    startline = command["--startline"]
    if command["--count"]:
      endline = startline + command["--count"]
    else:
      endline = command["--endline"]
    # Use a context manager so the list file is closed as soon as it has been
    # read instead of leaking the handle for the whole (long) run.
    with open(command["--list"], "r") as url_file:
      url_list = [MaskmakerURL(line.strip())
                  for line in url_file.readlines()[startline:endline]]

  complete_list = []
  error_list = []

  outdir = command["--outdir"]
  scrapes = command["--scrapes"]
  errors = command["--errors"]
  size = command["--size"]
  giveup = command["--giveup"]
  wait = command["--wait"]
  timeout = command["--timeout"]
  threshhold = command["--threshhold"]
  browser_path = command["--browserpath"]
  scrape_pass = 0

  # Fall back to the system temporary directory for intermediate scrapes.
  scrapedir = command["--scrapedir"]
  if not scrapedir:
    scrapedir = tempfile.gettempdir()

  # Get the scraper
  scraper = scrapers.GetScraper((command["--browser"], command["--browserver"]))

  # Repeatedly iterate through the list of URLs until either every URL has
  # a successful mask or too many errors, or we've exceeded the giveup limit
  while url_list and scrape_pass < giveup:
    # Scrape each URL
    for url in url_list:
      print("Processing %r..." % url.url)
      mask_filename = drivers.windowing.URLtoFilename(url.url, outdir, ".bmp")

      # Load the existing mask (the filename may change, see the helper)
      mask, mask_filename = _LoadOrCreateMask(mask_filename, size)

      # Find the stored scrape path
      mask_scrape_dir = os.path.join(
          scrapedir, os.path.splitext(os.path.basename(mask_filename))[0])
      drivers.windowing.PreparePath(mask_scrape_dir)

      # Find the baseline image. This must happen before the new scrape is
      # written into the same directory.
      baseline = _LoadBaseline(mask_scrape_dir)

      mask_scrape_filename = os.path.join(
          mask_scrape_dir, time.strftime("%y%m%d-%H%M%S.bmp"))

      # Do the scrape
      result = scraper.Scrape(
          [url.url], mask_scrape_dir, size, (0, 0),
          timeout, path=browser_path,
          filename=mask_scrape_filename)

      if result:
        # Return value other than None means an error
        print(" Scrape failed with error '%r'" % result)
        url.errors += 1
        if url.errors >= errors:
          print(" ** Exceeded maximum error count for this URL, giving up")
        # Nothing to compare after a failed scrape; move on to the next URL.
        continue

      # Load the new scrape
      scrape = Image.open(mask_scrape_filename)

      # Calculate the difference between the new scrape and the baseline,
      # subject to the current mask
      if baseline:
        diff = ImageChops.multiply(ImageChops.difference(scrape, baseline),
                                   mask.convert(scrape.mode))

        # If the difference is none, there's nothing to update.
        # NOTE(review): comparing against (0, 0) presumably relies on the
        # scrape being a multi-band image (getextrema() then yields one
        # (min, max) pair per band) - confirm for single-band scrapes.
        if max(diff.getextrema()) == (0, 0):
          print(" Scrape identical to baseline, no change in mask")
          url.consecutive_successes += 1
          if url.consecutive_successes >= scrapes:
            print(" ** No change for %r scrapes, done!" % scrapes)
        else:
          # convert the difference to black and white, then change all
          # black pixels (where the scrape and the baseline were identical)
          # to white, all others (where the scrape and the baseline differed)
          # to black.
          #
          # Since the below command is a little unclear, here's how it works.
          #    1. convert("L") converts the RGB image to grayscale
          #    2. point() maps grayscale values (or the individual channels
          #       of an RGB image) to different ones. Because it operates on
          #       individual channels, the grayscale conversion from step 1
          #       is necessary.
          #    3. The "1" second parameter to point() outputs the result as
          #       a monochrome bitmap. If the original RGB image were converted
          #       directly to monochrome, PIL would dither it.
          diff = diff.convert("L").point([255] + [0] * 255, "1")

          # count the number of different pixels (the first getcolors() entry
          # is taken to be the black, i.e. differing, pixels)
          diff_pixels = diff.getcolors()[0][0]

          # is this too much?
          diff_pixel_percent = (
              diff_pixels * 100.0 / (mask.size[0] * mask.size[1]))
          if diff_pixel_percent > threshhold:
            print(" Scrape differed from baseline by %.2f percent, ignoring"
                  % diff_pixel_percent)
          else:
            print(" Scrape differed in %d pixels, updating mask" % diff_pixels)
            mask = ImageChops.multiply(mask, diff)
            mask.save(mask_filename)

            # reset the number of consecutive "good" scrapes
            url.consecutive_successes = 0

    # Remove URLs whose mask is deemed done
    complete_list.extend(
        [url for url in url_list if url.consecutive_successes >= scrapes])
    error_list.extend(
        [url for url in url_list if url.errors >= errors])
    url_list = [
        url for url in url_list if
        url.consecutive_successes < scrapes and
        url.errors < errors]

    scrape_pass += 1
    print("**Done with scrape pass %d\n" % scrape_pass)

    if scrape_pass >= giveup:
      print("**Exceeded giveup threshhold. Giving up.")
    elif url_list:
      # Only pause when another pass will follow; sleeping after the last
      # URL has finished would just delay the summary report.
      print("Waiting %d seconds..." % wait)
      time.sleep(wait)

  print("")
  print("*** MASKMAKER COMPLETE ***")
  print("Summary report:")
  print(" %d masks successfully generated" % len(complete_list))
  for url in complete_list:
    print("  %s" % url.url)
  print(" %d masks failed with too many errors" % len(error_list))
  for url in error_list:
    print("  %s" % url.url)
  if scrape_pass >= giveup:
    print(" %d masks were not completed before "
          "reaching the giveup threshhold" % len(url_list))
    for url in url_list:
      print("  %s" % url.url)