shaka-packager/tools/site_compare/commands/maskmaker.py

# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Component for automatically creating masks of changing areas of a website.

Works by repeated invocation of a browser and scraping of the resulting page.
Areas that differ will be added to the auto-generated mask. The mask generator
considers the mask complete when further scrapes fail to produce any differences
in the mask.
"""

import os            # Functions for walking the directory tree
import tempfile      # Get a temporary directory to hold intermediates
import time          # Used for sleep() and naming masks by time

import command_line
import drivers
from PIL import Image
from PIL import ImageChops
import scrapers


def CreateCommand(cmdline):
  """Registers the "maskmaker" command and its arguments for parsing.

  Args:
    cmdline: command-line object exposing AddCommand(). The command object it
      returns is used to declare the arguments, dependencies, required groups
      and mutual exclusions listed below.
  """
  cmd = cmdline.AddCommand(
    ["maskmaker"],
    "Automatically generates a mask from a list of URLs",
    ValidateMaskmaker,
    ExecuteMaskmaker)

  # Browser selection
  cmd.AddArgument(
    ["-bp", "--browserpath"], "Full path to browser's executable",
    type="readfile", metaname="PATH")
  cmd.AddArgument(
    ["-b", "--browser"], "Which browser to use", type="string",
    default="chrome")
  cmd.AddArgument(
    ["-bv", "--browserver"], "Version of the browser", metaname="VERSION")

  # Where the generated masks are written
  cmd.AddArgument(
    ["-o", "--outdir"], "Directory to store generated masks", metaname="DIR",
    required=True)

  # URL source: either a single URL or a (possibly sliced) list file
  cmd.AddArgument(
    ["-u", "--url"], "URL to compare")
  cmd.AddArgument(
    ["-l", "--list"], "List of URLs to compare", type="readfile")
  cmd.AddMutualExclusion(["--url", "--list"])
  cmd.AddArgument(
    ["-s", "--startline"], "First line of URL list", type="int")
  cmd.AddArgument(
    ["-e", "--endline"], "Last line of URL list (exclusive)", type="int")
  cmd.AddArgument(
    ["-c", "--count"], "Number of lines of URL file to use", type="int")
  cmd.AddDependency("--startline", "--list")
  cmd.AddRequiredGroup(["--url", "--list"])
  cmd.AddDependency("--endline", "--list")
  cmd.AddDependency("--count", "--list")
  cmd.AddMutualExclusion(["--count", "--endline"])
  cmd.AddDependency("--count", "--startline")

  # Scrape timing and termination criteria
  cmd.AddArgument(
    ["-t", "--timeout"], "Amount of time (seconds) to wait for browser to "
    "finish loading",
    type="int", default=60)
  cmd.AddArgument(
    ["-w", "--wait"],
    "Amount of time (in seconds) to wait between successive scrapes",
    type="int", default=60)
  cmd.AddArgument(
    ["-sc", "--scrapes"],
    "Number of successive scrapes which must result in no change to a mask "
    "before mask creation is considered complete", type="int", default=10)
  cmd.AddArgument(
    ["-sz", "--size"], "Browser window size", default=(800, 600), type="coords")
  cmd.AddArgument(["-sd", "--scrapedir"], "Directory to store scrapes")
  cmd.AddArgument(
    ["-gu", "--giveup"],
    "Number of times to scrape before giving up", type="int", default=50)
  # The two help fragments are joined by implicit string concatenation, so the
  # first one must end with a space to keep "be discarded" as separate words.
  cmd.AddArgument(
    ["-th", "--threshhold"],
    "Percentage of different pixels (0-100) above which the scrape will be "
    "discarded and the mask not updated.", type="int", default=100)
  cmd.AddArgument(
    ["--er", "--errors"],
    "Number of times a scrape can fail before giving up on the URL.",
    type="int", default=1)


def ValidateMaskmaker(command):
  """Checks the maskmaker arguments, raising ParseError when they are invalid.

  The only check performed is that --browserpath, when given, names a file
  whose extension (compared case-insensitively) is .exe, .com or .bat.

  Args:
    command: parsed command, indexed by option name.

  Raises:
    command_line.ParseError: the browser path has a different extension.
  """
  browser_path = command["--browserpath"]
  if not browser_path:
    # No explicit browser path supplied, so there is nothing to validate.
    return

  extension = os.path.splitext(browser_path)[1].lower()
  if extension not in (".exe", ".com", ".bat"):
    raise command_line.ParseError("Browser filename must be an executable")


def ExecuteMaskmaker(command):
  """Performs automatic mask generation."""

  # Get the list of URLs to generate masks for
  class MaskmakerURL(object):
    """Helper class for holding information about a URL passed to maskmaker."""
    __slots__ = ['url', 'consecutive_successes', 'errors']
    def __init__(self, url):
      self.url = url
      self.consecutive_successes = 0
      self.errors = 0

  if command["--url"]:
    url_list = [MaskmakerURL(command["--url"])]
  else:
    startline = command["--startline"]
    if command["--count"]:
      endline = startline+command["--count"]
    else:
      endline = command["--endline"]
    url_list = [MaskmakerURL(url.strip()) for url in
                open(command["--list"], "r").readlines()[startline:endline]]

  complete_list = []
  error_list = []

  outdir = command["--outdir"]
  scrapes = command["--scrapes"]
  errors = command["--errors"]
  size = command["--size"]
  scrape_pass = 0

  scrapedir = command["--scrapedir"]
  if not scrapedir: scrapedir = tempfile.gettempdir()

  # Get the scraper
  scraper = scrapers.GetScraper((command["--browser"], command["--browserver"]))

  # Repeatedly iterate through the list of URLs until either every URL has
  # a successful mask or too many errors, or we've exceeded the giveup limit
  while url_list and scrape_pass < command["--giveup"]:
    # Scrape each URL
    for url in url_list:
      print "Processing %r..." % url.url
      mask_filename = drivers.windowing.URLtoFilename(url.url, outdir, ".bmp")

      # Load the existing mask. This is in a loop so we can try to recover
      # from error conditions
      while True:
        try:
          mask = Image.open(mask_filename)
          if mask.size != size:
            print "  %r already exists and is the wrong size! (%r vs %r)" % (
              mask_filename, mask.size, size)
            mask_filename = "%s_%r%s" % (
              mask_filename[:-4], size, mask_filename[-4:])
            print "  Trying again as %r..." % mask_filename
            continue
          break
        except IOError:
          print "  %r does not exist, creating" % mask_filename
          mask = Image.new("1", size, 1)
          mask.save(mask_filename)

      # Find the stored scrape path
      mask_scrape_dir = os.path.join(
        scrapedir, os.path.splitext(os.path.basename(mask_filename))[0])
      drivers.windowing.PreparePath(mask_scrape_dir)

      # Find the baseline image
      mask_scrapes = os.listdir(mask_scrape_dir)
      mask_scrapes.sort()

      if not mask_scrapes:
        print "  No baseline image found, mask will not be updated"
        baseline = None
      else:
        baseline = Image.open(os.path.join(mask_scrape_dir, mask_scrapes[0]))

      mask_scrape_filename = os.path.join(mask_scrape_dir,
                                          time.strftime("%y%m%d-%H%M%S.bmp"))

      # Do the scrape
      result = scraper.Scrape(
        [url.url], mask_scrape_dir, size, (0, 0),
        command["--timeout"], path=command["--browserpath"],
        filename=mask_scrape_filename)

      if result:
        # Return value other than None means an error
        print "  Scrape failed with error '%r'" % result
        url.errors += 1
        if url.errors >= errors:
          print "  ** Exceeded maximum error count for this URL, giving up"
        continue

      # Load the new scrape
      scrape = Image.open(mask_scrape_filename)

      # Calculate the difference between the new scrape and the baseline,
      # subject to the current mask
      if baseline:
        diff = ImageChops.multiply(ImageChops.difference(scrape, baseline),
                                   mask.convert(scrape.mode))

        # If the difference is none, there's nothing to update
        if max(diff.getextrema()) == (0, 0):
          print "  Scrape identical to baseline, no change in mask"
          url.consecutive_successes += 1
          if url.consecutive_successes >= scrapes:
            print "  ** No change for %r scrapes, done!" % scrapes
        else:
          # convert the difference to black and white, then change all
          # black pixels (where the scrape and the baseline were identical)
          # to white, all others (where the scrape and the baseline differed)
          # to black.
          #
          # Since the below command is a little unclear, here's how it works.
          #    1. convert("L") converts the RGB image to grayscale
          #    2. point() maps grayscale values (or the individual channels)
          #       of an RGB image) to different ones. Because it operates on
          #       individual channels, the grayscale conversion from step 1
          #       is necessary.
          #    3. The "1" second parameter to point() outputs the result as
          #       a monochrome bitmap. If the original RGB image were converted
          #       directly to monochrome, PIL would dither it.
          diff = diff.convert("L").point([255]+[0]*255, "1")

          # count the number of different pixels
          diff_pixels = diff.getcolors()[0][0]

          # is this too much?
          diff_pixel_percent = diff_pixels * 100.0 / (mask.size[0]*mask.size[1])
          if diff_pixel_percent > command["--threshhold"]:
            print ("  Scrape differed from baseline by %.2f percent, ignoring"
                   % diff_pixel_percent)
          else:
            print "  Scrape differed in %d pixels, updating mask" % diff_pixels
            mask = ImageChops.multiply(mask, diff)
            mask.save(mask_filename)

            # reset the number of consecutive "good" scrapes
            url.consecutive_successes = 0

    # Remove URLs whose mask is deemed done
    complete_list.extend(
      [url for url in url_list if url.consecutive_successes >= scrapes])
    error_list.extend(
      [url for url in url_list if url.errors >= errors])
    url_list = [
      url for url in url_list if
      url.consecutive_successes < scrapes and
      url.errors < errors]

    scrape_pass += 1
    print "**Done with scrape pass %d\n" % scrape_pass

    if scrape_pass >= command["--giveup"]:
      print "**Exceeded giveup threshhold. Giving up."
    else:
      print "Waiting %d seconds..." % command["--wait"]
      time.sleep(command["--wait"])

  print
  print "*** MASKMAKER COMPLETE ***"
  print "Summary report:"
  print "  %d masks successfully generated" % len(complete_list)
  for url in complete_list:
    print "    ", url.url
  print "  %d masks failed with too many errors" % len(error_list)
  for url in error_list:
    print "    ", url.url
  if scrape_pass >= command["--giveup"]:
    print ("  %d masks were not completed before "
           "reaching the giveup threshhold" % len(url_list))
    for url in url_list:
      print "    ", url.url
Start with media/mp4, media/webm and base codes from Chromium. 2013-09-24 01:35:40 +00:00			`# Copyright (c) 2011 The Chromium Authors. All rights reserved.`
			`# Use of this source code is governed by a BSD-style license that can be`
			`# found in the LICENSE file.`

			`"""Component for automatically creating masks of changing areas of a website.`

			`Works by repeated invokation of a browser and scraping of the resulting page.`
			`Areas that differ will be added to the auto-generated mask. The mask generator`
			`considers the mask complete when further scrapes fail to produce any differences`
			`in the mask.`
			`"""`

			`import os # Functions for walking the directory tree`
			`import tempfile # Get a temporary directory to hold intermediates`
			`import time # Used for sleep() and naming masks by time`

			`import command_line`
			`import drivers`
			`from PIL import Image`
			`from PIL import ImageChops`
			`import scrapers`


			`def CreateCommand(cmdline):`
			`"""Inserts the command and arguments into a command line for parsing."""`
			`cmd = cmdline.AddCommand(`
			`["maskmaker"],`
			`"Automatically generates a mask from a list of URLs",`
			`ValidateMaskmaker,`
			`ExecuteMaskmaker)`

			`cmd.AddArgument(`
			`["-bp", "--browserpath"], "Full path to browser's executable",`
			`type="readfile", metaname="PATH")`
			`cmd.AddArgument(`
			`["-b", "--browser"], "Which browser to use", type="string",`
			`default="chrome")`
			`cmd.AddArgument(`
			`["-bv", "--browserver"], "Version of the browser", metaname="VERSION")`
			`cmd.AddArgument(`
			`["-o", "--outdir"], "Directory to store generated masks", metaname="DIR",`
			`required=True)`
			`cmd.AddArgument(`
			`["-u", "--url"], "URL to compare")`
			`cmd.AddArgument(`
			`["-l", "--list"], "List of URLs to compare", type="readfile")`
			`cmd.AddMutualExclusion(["--url", "--list"])`
			`cmd.AddArgument(`
			`["-s", "--startline"], "First line of URL list", type="int")`
			`cmd.AddArgument(`
			`["-e", "--endline"], "Last line of URL list (exclusive)", type="int")`
			`cmd.AddArgument(`
			`["-c", "--count"], "Number of lines of URL file to use", type="int")`
			`cmd.AddDependency("--startline", "--list")`
			`cmd.AddRequiredGroup(["--url", "--list"])`
			`cmd.AddDependency("--endline", "--list")`
			`cmd.AddDependency("--count", "--list")`
			`cmd.AddMutualExclusion(["--count", "--endline"])`
			`cmd.AddDependency("--count", "--startline")`
			`cmd.AddArgument(`
			`["-t", "--timeout"], "Amount of time (seconds) to wait for browser to "`
			`"finish loading",`
			`type="int", default=60)`
			`cmd.AddArgument(`
			`["-w", "--wait"],`
			`"Amount of time (in seconds) to wait between successive scrapes",`
			`type="int", default=60)`
			`cmd.AddArgument(`
			`["-sc", "--scrapes"],`
			`"Number of successive scrapes which must result in no change to a mask "`
			`"before mask creation is considered complete", type="int", default=10)`
			`cmd.AddArgument(`
			`["-sz", "--size"], "Browser window size", default=(800, 600), type="coords")`
			`cmd.AddArgument(["-sd", "--scrapedir"], "Directory to store scrapes")`
			`cmd.AddArgument(`
			`["-gu", "--giveup"],`
			`"Number of times to scrape before giving up", type="int", default=50)`
			`cmd.AddArgument(`
			`["-th", "--threshhold"],`
			`"Percentage of different pixels (0-100) above which the scrape will be"`
			`"discarded and the mask not updated.", type="int", default=100)`
			`cmd.AddArgument(`
			`["--er", "--errors"],`
			`"Number of times a scrape can fail before giving up on the URL.",`
			`type="int", default=1)`


			`def ValidateMaskmaker(command):`
			`"""Validate the arguments to maskmaker. Raises ParseError if failed."""`
			`executables = [".exe", ".com", ".bat"]`
			`if command["--browserpath"]:`
			`if os.path.splitext(command["--browserpath"])[1].lower() not in executables:`
			`raise command_line.ParseError("Browser filename must be an executable")`


			`def ExecuteMaskmaker(command):`
			`"""Performs automatic mask generation."""`

			`# Get the list of URLs to generate masks for`
			`class MaskmakerURL(object):`
			`"""Helper class for holding information about a URL passed to maskmaker."""`
			`__slots__ = ['url', 'consecutive_successes', 'errors']`
			`def __init__(self, url):`
			`self.url = url`
			`self.consecutive_successes = 0`
			`self.errors = 0`

			`if command["--url"]:`
			`url_list = [MaskmakerURL(command["--url"])]`
			`else:`
			`startline = command["--startline"]`
			`if command["--count"]:`
			`endline = startline+command["--count"]`
			`else:`
			`endline = command["--endline"]`
			`url_list = [MaskmakerURL(url.strip()) for url in`
			`open(command["--list"], "r").readlines()[startline:endline]]`

			`complete_list = []`
			`error_list = []`

			`outdir = command["--outdir"]`
			`scrapes = command["--scrapes"]`
			`errors = command["--errors"]`
			`size = command["--size"]`
			`scrape_pass = 0`

			`scrapedir = command["--scrapedir"]`
			`if not scrapedir: scrapedir = tempfile.gettempdir()`

			`# Get the scraper`
			`scraper = scrapers.GetScraper((command["--browser"], command["--browserver"]))`

			`# Repeatedly iterate through the list of URLs until either every URL has`
			`# a successful mask or too many errors, or we've exceeded the giveup limit`
			`while url_list and scrape_pass < command["--giveup"]:`
			`# Scrape each URL`
			`for url in url_list:`
			`print "Processing %r..." % url.url`
			`mask_filename = drivers.windowing.URLtoFilename(url.url, outdir, ".bmp")`

			`# Load the existing mask. This is in a loop so we can try to recover`
			`# from error conditions`
			`while True:`
			`try:`
			`mask = Image.open(mask_filename)`
			`if mask.size != size:`
			`print " %r already exists and is the wrong size! (%r vs %r)" % (`
			`mask_filename, mask.size, size)`
			`mask_filename = "%s_%r%s" % (`
			`mask_filename[:-4], size, mask_filename[-4:])`
			`print " Trying again as %r..." % mask_filename`
			`continue`
			`break`
			`except IOError:`
			`print " %r does not exist, creating" % mask_filename`
			`mask = Image.new("1", size, 1)`
			`mask.save(mask_filename)`

			`# Find the stored scrape path`
			`mask_scrape_dir = os.path.join(`
			`scrapedir, os.path.splitext(os.path.basename(mask_filename))[0])`
			`drivers.windowing.PreparePath(mask_scrape_dir)`

			`# Find the baseline image`
			`mask_scrapes = os.listdir(mask_scrape_dir)`
			`mask_scrapes.sort()`

			`if not mask_scrapes:`
			`print " No baseline image found, mask will not be updated"`
			`baseline = None`
			`else:`
			`baseline = Image.open(os.path.join(mask_scrape_dir, mask_scrapes[0]))`

			`mask_scrape_filename = os.path.join(mask_scrape_dir,`
			`time.strftime("%y%m%d-%H%M%S.bmp"))`

			`# Do the scrape`
			`result = scraper.Scrape(`
			`[url.url], mask_scrape_dir, size, (0, 0),`
			`command["--timeout"], path=command["--browserpath"],`
			`filename=mask_scrape_filename)`

			`if result:`
			`# Return value other than None means an error`
			`print " Scrape failed with error '%r'" % result`
			`url.errors += 1`
			`if url.errors >= errors:`
			`print " ** Exceeded maximum error count for this URL, giving up"`
			`continue`

			`# Load the new scrape`
			`scrape = Image.open(mask_scrape_filename)`

			`# Calculate the difference between the new scrape and the baseline,`
			`# subject to the current mask`
			`if baseline:`
			`diff = ImageChops.multiply(ImageChops.difference(scrape, baseline),`
			`mask.convert(scrape.mode))`

			`# If the difference is none, there's nothing to update`
			`if max(diff.getextrema()) == (0, 0):`
			`print " Scrape identical to baseline, no change in mask"`
			`url.consecutive_successes += 1`
			`if url.consecutive_successes >= scrapes:`
			`print " ** No change for %r scrapes, done!" % scrapes`
			`else:`
			`# convert the difference to black and white, then change all`
			`# black pixels (where the scrape and the baseline were identical)`
			`# to white, all others (where the scrape and the baseline differed)`
			`# to black.`
			`#`
			`# Since the below command is a little unclear, here's how it works.`
			`# 1. convert("L") converts the RGB image to grayscale`
			`# 2. point() maps grayscale values (or the individual channels)`
			`# of an RGB image) to different ones. Because it operates on`
			`# individual channels, the grayscale conversion from step 1`
			`# is necessary.`
			`# 3. The "1" second parameter to point() outputs the result as`
			`# a monochrome bitmap. If the original RGB image were converted`
			`# directly to monochrome, PIL would dither it.`
			`diff = diff.convert("L").point([255]+[0]*255, "1")`

			`# count the number of different pixels`
			`diff_pixels = diff.getcolors()[0][0]`

			`# is this too much?`
			`diff_pixel_percent = diff_pixels * 100.0 / (mask.size[0]*mask.size[1])`
			`if diff_pixel_percent > command["--threshhold"]:`
			`print (" Scrape differed from baseline by %.2f percent, ignoring"`
			`% diff_pixel_percent)`
			`else:`
			`print " Scrape differed in %d pixels, updating mask" % diff_pixels`
			`mask = ImageChops.multiply(mask, diff)`
			`mask.save(mask_filename)`

			`# reset the number of consecutive "good" scrapes`
			`url.consecutive_successes = 0`

			`# Remove URLs whose mask is deemed done`
			`complete_list.extend(`
			`[url for url in url_list if url.consecutive_successes >= scrapes])`
			`error_list.extend(`
			`[url for url in url_list if url.errors >= errors])`
			`url_list = [`
			`url for url in url_list if`
			`url.consecutive_successes < scrapes and`
			`url.errors < errors]`

			`scrape_pass += 1`
			`print "**Done with scrape pass %d\n" % scrape_pass`

			`if scrape_pass >= command["--giveup"]:`
			`print "**Exceeded giveup threshhold. Giving up."`
			`else:`
			`print "Waiting %d seconds..." % command["--wait"]`
			`time.sleep(command["--wait"])`

			`print`
			`print "* MASKMAKER COMPLETE *"`
			`print "Summary report:"`
			`print " %d masks successfully generated" % len(complete_list)`
			`for url in complete_list:`
			`print " ", url.url`
			`print " %d masks failed with too many errors" % len(error_list)`
			`for url in error_list:`
			`print " ", url.url`
			`if scrape_pass >= command["--giveup"]:`
			`print (" %d masks were not completed before "`
			`"reaching the giveup threshhold" % len(url_list))`
			`for url in url_list:`
			`print " ", url.url`