Category Archives: Python

Mapbox Visual Center Algorithm: arcpy (attempt #2)

quadtree2

Yesterday, I attempted to replicate the Mapbox Visual Center algorithm using ArcGIS arcpy (Python). Today, I actually looked at the source code (code, license) and attempted to fully port it over from JavaScript. I think it works properly, but you tell me.

def polylabel(polygon, centroid, precision, debug):
    cells = []
    precision = precision or 1.0
    extent = polygon.extent
    minX, minY, maxX, maxY = (extent.XMin, extent.YMin, extent.XMax, extent.YMax)
    width = extent.width
    height = extent.height
    cellSize = min([width, height])
    h = cellSize / 2
    cellQueue = []
    x = minX
    while x < maxX:
        y = minY
        while y < maxY:             cell = Cell(x+h, y+h, h, polygon)             cells.append(cell.geom)             cellQueue.append(cell)             y += cellSize         x += cellSize     bestCell = Cell(centroid[0], centroid[1], 0, polygon)     numProbes = len(cellQueue)     while len(cellQueue):         cellQueue.sort(key=lambda cell: cell.d)         cell = cellQueue.pop()         cells.append(cell.geom)         if cell.d > bestCell.d:
            bestCell = cell
        if cell.max - bestCell.d <= precision:
            continue
        h = cell.h/2
        cellQueue.append(Cell(cell.x-h, cell.y-h, h, polygon))
        cellQueue.append(Cell(cell.x+h, cell.y-h, h, polygon))
        cellQueue.append(Cell(cell.x-h, cell.y+h, h, polygon))
        cellQueue.append(Cell(cell.x+h, cell.y+h, h, polygon))
        numProbes += 4
    return [bestCell.x, bestCell.y, cells]
class Cell():
    def __init__(self, x, y, h, polygon):
        self.x = x
        self.y = y
        self.h = h
        self.d = pointToPolygonDist(x, y, polygon)
        self.max = self.d + self.h * math.sqrt(2)
        self.geom = arcpy.Polygon(arcpy.Array([arcpy.Point(x-h,y-h),arcpy.Point(x+h,y-h),arcpy.Point(x+h,y+h),arcpy.Point(x-h,y+h)]))
def pointToPolygonDist(x, y, polygon):
    point_geom = arcpy.PointGeometry(arcpy.Point(x, y),sr)
    polygon_outline = polygon.boundary()
    min_dist = polygon_outline.queryPointAndDistance(point_geom)
    sign_dist = min_dist[2] * ((min_dist[3]-0.5)*2)
    return sign_dist
fc = 'BEC_POLY selection'
sr = arcpy.Describe(fc).spatialReference
centroids = []
label_points = []
cells = []
with arcpy.da.SearchCursor(fc,['SHAPE@','SHAPE@XY']) as cursor:
    for row in cursor:
        best_point = polylabel(row[0],row[1],.75,'#')
        centroids.append(arcpy.PointGeometry(row[0].centroid,sr))
        label_points.append(arcpy.PointGeometry(arcpy.Point(best_point[0], best_point[1]),sr))
        for cell in best_point[2]:
            cells.append(cell)
arcpy.CopyFeatures_management(centroids,r'in_memory\centroids')
arcpy.CopyFeatures_management(label_points, r'in_memory\label_points')
arcpy.CopyFeatures_management(cells, r'in_memory\cells')

Mapbox Visual Center Algorithm: arcpy

quadtree

On Monday, Mapbox published a JavaScript implementation of a fast algorithm for finding the visual center of a polygon using quadtrees (blog, code). I took a stab at it using Python/arcpy in ArcGIS:

def quadify(poly): # split polygon into quads
    ext = poly.extent
    TM = arcpy.Point((ext.XMin+ext.XMax)/2,ext.YMax)
    LM = arcpy.Point(ext.XMin,(ext.YMin+ext.YMax)/2)
    BM = arcpy.Point((ext.XMin+ext.XMax)/2,ext.YMin)
    RM = arcpy.Point(ext.XMax,(ext.YMin+ext.YMax)/2)
    TL = arcpy.Polygon(arcpy.Array([ext.upperLeft,LM,poly.centroid,TM]),sr)
    TR = arcpy.Polygon(arcpy.Array([TM,poly.centroid,RM,ext.upperRight]),sr)
    BL = arcpy.Polygon(arcpy.Array([LM,ext.lowerLeft,BM,poly.centroid]),sr)
    BR = arcpy.Polygon(arcpy.Array([poly.centroid,BM,ext.lowerRight,RM]),sr)
    return [TL,TR,BL,BR]
def inspect(quad,poly): # calculate quad radius & quad center to polygon distance
    poly_boundary = poly.boundary()
    quad_center = arcpy.PointGeometry(quad.centroid,sr)
    radius = quad_center.distanceTo(quad.firstPoint)
    q_dist = poly_boundary.queryPointAndDistance(quad_center)
    dist = q_dist[2] if q_dist[3] else -q_dist[2]
    return dist + radius
def loop_quads(quads): # evaluate the quads & return if in the top 10% of values
    max_quads = []
    max_dist = 0
    dists = {}
    for quad in quads:
        cur_dist = inspect(quad,row[0])
        dists[cur_dist] = quad
        max_dist = cur_dist if cur_dist > max_dist else max_dist
        out_polys.append(quad)
    for k,v in dists.iteritems():
        if k > max_dist * 0.90: # precision = 90%
            max_quads.append(v)
    return max_quads
fc = 'wetlands_select' # feature class/layer
sr = arcpy.Describe(fc).spatialReference # spatial reference
out_polys = []
out_points = []
with arcpy.da.SearchCursor(fc,'SHAPE@',spatial_reference=sr) as cursor: # loop polygons
    for row in cursor:
        ext = row[0].extent
        center = arcpy.PointGeometry(arcpy.Point((ext.XMin+ext.XMax)/2,(ext.YMin+ext.YMax)/2),sr)
        radius = max(center.distanceTo(ext.upperLeft),center.distanceTo(ext.upperRight),center.distanceTo(ext.lowerLeft),center.distanceTo(ext.lowerRight))
        circle = center.buffer(radius)
        quads = quadify(circle) # start with global circle
        for i in range(10): # quadify 10x
            max_quads = loop_quads(quads) # evaluate current set of quads
            quads = []
            for quad in max_quads:
                quads += quadify(quad)
        out_points.append(arcpy.PointGeometry(quad.centroid,sr)) # return one of the best quad centers
arcpy.CopyFeatures_management(out_polys,r'in_memory\polys')
arcpy.CopyFeatures_management(out_points,r'in_memory\points')

Notes:

I’m not 100% sure that this emulates the base Mapbox algorithm, and I didn’t attempt to implement the “priority queue” enhancement.

The results are somewhat sensitive to the precision factor, but I suppose that’s the be expected with a speed algorithm.

I realize that you can calculate a label point much easier within ArcGIS using out-of-the-box tools. This was just me exploring the algorithm.

Anyhow, I welcome your comments on the code above!

GCHQ Quiz Grinch: 2

***Contains spoilers, obviously.***

As I mentioned previously, the GCHQ (Government Communications Headquarters) released a hilariously infuriating Christmas card, which consisted of a logic puzzle, which once solved, led to more puzzles, which led to more puzzles, and so on. I cheated to complete Part 2. And, here’s how you can cheat to complete Part 4.

Part 4 consists of three series of numbers. These numbers, back to back, create an IP address to access Part 5. Three numbers to make one IP address. Hmmm, an IP address contains four numbers. One of the numbers must be a decimal. So, I’m looking for two integers and one float, but I’m not sure which.

If you look at the source code, there is a hash function that is used to verify that the answers are correct. Basically, it concatenates the three inputs, performs some one-way computations, returns two answers, and checks if they match two other, known numbers. So, you could work out the correct answers (if you’re smart enough), ooooooooor you could try all possible answers (which is what I did).

There are a few ways to constrain the list of possible answers, which will cut down on the computation time. All IP address components are integers between 0 and 255. Also, I took a guess that the first component was between 0-128 – a Class A IP address. The hash function takes 3 inputs, separated by null characters (“\0”). That means I need to separate 4 integers by two null characters and one period (representing a decimal), but I don’t know in which place the decimal should be. There are three possibilities for that.

So, in all, there are 128 * 256 * 256 * 256 * 3 possibilities (first, second, third, fourth IP components, and decimal placements) to inspect. That’s a grand total of 6,442,450,944 combinations to evaluate. Even for Python, that’s a lot of comparisons, but it can be done. It takes about 0.115s to run 128*3 comparisons, about 29.5s to run 128*256*3 comparisons. To complete our six billion comparisons, it will take approximately 22.5 days. Luckily, the final answer is located fairly early in the sequence, so the final run time is about 8.7 days.

Here’s the script I used:

import ctypes, sys, cProfile

def hsh(dat):
    resultA = 3141592654
    resultB = 1234567890
    for i in range(2):
        initA = resultA
        initB = resultB
        for j in range(len(dat)):
            resultA += ord(dat.lower()[j])
            resultB = ctypes.c_int((resultA * 31) ^ resultB).value
            tmp = resultA & resultA
            resultA = resultB & resultB
            resultB = tmp
        resultA = ctypes.c_int(resultA ^ initA).value
        resultB = ctypes.c_int(resultB ^ initB).value
    return [resultA, resultB]

dot_config = [['.','\0','\0'],['\0','.','\0'],['\0','\0','.']]

def findIP():
    for x in range(128):
        for y in range(256):
            for z in range(256):
                for a in range(256):
                    for dots in dot_config:    
                        res = hsh("".join([str(x), dots[0], str(y), dots[1], str(z), dots[2], str(a)]))
                        if (res[0]==1824745082) and (res[1]==560037081):
                            print ([str(x), dots[0], str(y), dots[1], str(z), dots[2], str(a)])
                            sys.exit()
            print (x, y, z, a, res[0], res[1])
    
cProfile.runctx('findIP()',None,locals())

 

And, the final answers: 52, 30.87, 208

The quiz is for charity, so don’t forget to donate here.

GCHQ Quiz Grinch: 1

***Contains spoilers, obviously.***

The GCHQ (Government Communications Headquarters) released a hilariously infuriating Christmas card, which consisted of a logic puzzle, which once solved, led to more puzzles, which led to more puzzles, and so on.

I solved the first puzzle no problem. That revealed Part 2, a series of 6 multiple choice questions, all of which required a correct answer to move on. I got two of them, but was left at a loss for the other four. I gave it an honest try, but really wanted to see what Part 3 had in store. So, I turned to cheating. And, here’s how.

Clicking through the answers, you may notice that the URL contains the previous answer letter. For example, if your answers were A, B, C, D, E, and F, the final webpage (telling you that you were wrong) would have the URL: http://s3-eu-west-1.amazonaws.com/puzzleinabucket/ABCDEF.html

If you had enough time, you could type in each combination and see if that led to a “correct” URL. Obviously, that would be fairly time-consuming. Each of 6 questions had 6 possible answers, which means 6 ^ 6 = 46656 possibilities. You could save some time by including known correct answers. In my case, I had two answers of which I was confident. That left me with 6 ^ 4 = 1296 possibilities, still too many to inspect manually.

However, a simple Python script could load the pages fairly quickly (about once per second, or a maximum of 21.6 minutes), inspect the resulting webpage, and determine if it contains a message indicating that the combination was incorrect. If so, move onto next combination. If not, stop and print the correct combination.

Here’s the script:

import requests, time, sys
from lxml import html

opts = ['A','B','C','D','E','F'] # possible answers
for i in opts:
    for j in opts:
        for k in opts:
            for l in opts:
                url = i + j + k + 'A' + l + 'E' # 4 unknown, 2 known
                time.sleep(1) # provide rest in between calls
                
                r = requests.get('http://s3-eu-west-1.amazonaws.com/puzzleinabucket/' + url + '.html') # make call
                tree = html.fromstring(r.text) # store page html
    
                if tree.xpath('//div[contains(., "Sorry - you did not get all the questions correct.")]/p//text()'):
                    pass
                else:
                    print "SUCCESS " + url
                    sys.exit() # quit script once correct combination is found

 

And the final answer: DEFACE

The quiz is for charity, so don’t forget to donate here.

 

GeoNet Prime Answering Times

geonet_primetimes

ESRI runs a biannual contest to encourage participation on their help forum, GeoNet. Prizes go to the top ten point-getters. I found out during the last contest that it takes a great deal of time and persistence to keep up with the top of the pack (I ended up 5th). There are several tips for optimizing your effort (some of which I outlined, somewhat sarcastically, here).

One such tip is: get on GeoNet during times when there are the greatest number of fresh, unanswered questions. If you’ve spent any time on GeoNet, you have likely noticed that questions are generally asked during North American working hours, when people are struggling to get through their work-related GIS tasks. I wanted to put some better numbers to this idea, so I set about gathering the data myself.

All the information is there: each post has the date/time it was asked written right there in the posting. You could click on each post and record that date/time into an Excel and be done with it, but that would be awfully tedious. This is where screen scraping comes in. Screen scraping is the direct equivalent of having your computer control your web browser: click here, find this part of the HTML code, read it, and do something with it.Luckily, your computer doesn’t care if it has to spend all day doing the same thing over and over and over…

I chose to use Python, but you can do this in other languages, as well. Useful libraries to download are Requests and lxml. I use Requests for making the, you guessed it, “requests”, which are similar to typing a URL in the address bar of your browser. I use lxml for parsing and traversing the returned HTML code, which you can look at on any web page by pressing Ctrl+u (at least, in Chrome).

from lxml import html
import requests, time, csv

with open('C:/junk/geonet.csv', 'w') as csvfile: # create and/or open a CSV file
  csvWriter = csv.writer(csvfile, delimiter=" ", quoting=csv.QUOTE_MINIMAL) # writer
  dateList = []
  baseUrl = 'https://geonet.esri.com/content' # store the URL prefix

  for i in range(10): # loop through the first 10 'Content' pages
    page = requests.get(baseUrl + '?start=' + str(i*20)) # navigate to page
    tree = html.fromstring(page.text) # retrieve the HTML
    linkList = tree.iterlinks() # find all the links on the page
    threads = []

    for link in linkList: # loop through the links
      if link[2].startswith('/thread/'): # find those starting with "thread"
        threads.append(link[2]) # add the link to the list

    threadBase = 'https://geonet.esri.com' # store the URL prefix
    for thread in threads: # loop through the threads listed on the 'Content' page
      page = requests.get(threadBase + thread) # navigate to the correct thread page
      tree = html.fromstring(page.text) # retrieve the HTML
      dates = tree.find_class('j-post-author') # retrieve the date
      dateList.append(dates[0].text_content().strip()) # write to list
      csvWriter.writerow(dates[0].text_content().strip()) # write to CSV
      time.sleep(5) # wait 5s to give server a chance to handle someone else's requests

Anyhow, the graph at the start of this post shows pretty much what I expected: people on the East Coast get confused, then people on the West Coast get confused, then everyone goes home.

Word tables to Excel: Python

Question: I’ve got a folder of 100+ Word documents, each containing a table or two. Can you format that into an Excel spreadsheet for me?

Answer: Yes – bask in the power and glory of Python.

Using the Python for Windows Extension (specifically the win32com.client library), you can drive both Word and Excel with Python. Knowing that this functionality exists is half the battle.

# import libraries
import win32com.client as win32
import os

myDir = r'C:\Projects\ProjectX'

# open invisible Excel app
XL = win32.Dispatch('Excel.Application')
XL.Visible = 0
# load pre-made workbook
XLbook = XL.Workbooks.Open(os.path.join(myDir,'ProjectX_Spreadsheet.xlsx'))
# navigate to first worksheet
XLsheet = XLbook.Worksheets(1)
# counter to keep track of Excel row
XLrow = 2

# loop through files in directory
for myFile in os.listdir(myDir):
 filepath = os.path.join(myDir,myFile)
 filename = os.path.splitext(myFile)[0]
 ext = os.path.splitext(myFile)[1]

 # check if *.docx {optional}
 if ext == '.docx':
 # open invisible Word app
 word = win32.Dispatch('Word.Application')
 word.Visible = 0
 # open Word doc to read
 word.Documents.Open(filepath)
 doc = word.ActiveDocument

 # access first table in Word doc. For subsequent tables, increase index.
 table = doc.Tables(1)

 # get (Word) and set (Excel) some data
 XLsheet.Cells(XLrow,1).Value = table.Cell(Row=1, Column=1).Range.Text

 # get (Word) and set (Excel) some more data
 XLsheet.Cells(XLrow,2).Value = table.Cell(Row=2, Column=3).Range.Text

 # move to next row
 XLrow = XLrow + 1
 # close the current Word doc
 doc.Close()

# exit the Word app
word.Quit()
del word
# save and close Excel app
XLbook.Close(True)
XL.Quit()
del XL

Arcpy: Polygons to Centroids (within Polygons, and with all Attributes)

Here’s a fairly simple python script that creates centroid points (constrained to fall within polygons) and attaches all the attributes (plus a link field, “ORIG_ID”) from the polygons to the centroids. Notably, it uses cursors to read raw geometry objects – no geoprocessing tools (except CreateFeatureclass). Also, while Advanced licensees may use the Feature to Point tool for this procedure, the method presented here is completely free, using no extensions or non-basic licensing – I don’t know of another free way to achieve this in ArcGIS. Wouldn’t it be easier to run a spatial join on the points to attach the attributes? Yes, but spatial join is slow. In my test of 1000 polygons, the cursor method completed in about 2 seconds, while adding a spatial join call extended the run time to more than 10 seconds. This could mean big savings on large datasets.

  1. # import libraries
  2. import arcpy, os
  3. # set input/output parameters
  4. polyFC = arcpy.GetParameterAsText(0)
  5. outCentroids = arcpy.GetParameterAsText(1)
  6. # set overwrite environment
  7. arcpy.env.overwriteOutput = True
  8. # if the output file does not exist, create it. Add “ORIG_ID” field.
  9. if not arcpy.Exists(outCentroids):
  10.     arcpy.CreateFeatureclass_management(os.path.dirname(outCentroids),
  11.                                     os.path.basename(outCentroids),
  12.                                     “POINT”,
  13.                                     polyFC,
  14.                                     “”,
  15.                                     “”,
  16.                                     polyFC)
  17.     arcpy.AddField_management(outCentroids,‘ORIG_ID’‘LONG’)
  18. # create an InsertCursor containing all the fields in ourCentroids, which are all the fields in polyFC plus “ORIG_ID”
  19. iCursor = arcpy.da.InsertCursor(outCentroids, [‘*’])
  20. # read all features in polyFC
  21. with arcpy.da.SearchCursor(polyFC, [“SHAPE@”,‘*’,‘OID@’]) as sCursor:
  22.     for row in sCursor:
  23.          # create an array to hold a new, modified row
  24.         rowArray = []
  25.          # read all the fields except “SHAPE@”
  26.         for fieldnum in range(1,len(row)):
  27.             # if this is the 2nd field [i.e. SHAPE], replace it with the centroid coordinates
  28.             if fieldnum == 2:
  29.                 rowArray.append(row[0].centroid)
  30.             else:
  31.                 rowArray.append(row[fieldnum])
  32.         # write the new row to the cursor
  33.         iCursor.insertRow(rowArray)
  34. del iCursor

Looping through the fields in this way (lines 29 – 36) feels cumbersome – if anyone knows a better way to do this, please let me know!

————————————————-

EDIT (January 19, 2016): Following some discussion on LinkedIn’s GIS and Geography group regarding this post, here is an example of why you must use the SHAPE@ token (and Polygon.centroid) rather than either the SHAPE@XY or SHAPE@TRUECENTROID tokens to guarantee that the point falls inside the given polygon:

… points = []
… with arcpy.da.SearchCursor(“MY_FEATURE_LAYER”,[‘SHAPE@’,’Shape@XY’,’SHAPE@TRUECENTROID’]) as cursor:
…   for row in cursor:
…     points.append(arcpy.PointGeometry(row[0].centroid)) # SHAPE@
…     points.append(arcpy.PointGeometry(arcpy.Point(row[1][0],row[1][1]))) # SHAPE@XY
…     points.append(arcpy.PointGeometry(arcpy.Point(row[2][0],row[2][1]))) # SHAPE@TRUECENTROID
… arcpy.CopyFeatures_management(points, r’in_memory\mypoints’)

centroid_tokens