A talk presented at EuroPython on 30th June 2009.
Crowdsourcing with Django
EuroPython, 30th June 2009
Simon Willison · http://simonwillison.net/ · @simonw
“Web development on journalism deadlines”
The back story...
November 2000: The Freedom of Information Act
• http://www.guardian.co.uk/politics/2009/may/08/mps-expenses-telegraph-checquebook-journalism
• http://www.guardian.co.uk/politics/2009/may/15/mps-expenses-heather-brooke-foi
Heather Brooke
2004: The request
January 2005: The FOI request
July 2006: The FOI commissioner
May 2007: The FOI (Amendment) Bill
February 2008: The Information Tribunal
“Transparency will damage democracy”
May 2008: The high court
January 2009: The exemption law
March 2009: The mole
“All of the receipts of 650-odd MPs, redacted and unredacted, are for sale at a price of £300,000, so I am told. The price is going up because of the interest in the subject.”
Sir Stuart Bell, MP
Newsnight, 30th March
8th May 2009: The Daily Telegraph
At the Guardian...
April: “Expenses are due out in a couple of months, is there anything we can do?”
June: “Expenses have been bumped forward, they’re out next week!”
Thursday 11th June: The proof-of-concept
Monday 15th June: The tentative go-ahead
Tuesday 16th June: Designer + client-side engineer
Wednesday 17th June: Operations engineer
Thursday 18th June: Launch day!
How we built it
$ convert Frank_Comm.pdf pages.png
Models
class Party(models.Model):
    name = models.CharField(max_length=100)

class Constituency(models.Model):
    name = models.CharField(max_length=100)

class MP(models.Model):
    name = models.CharField(max_length=100)
    party = models.ForeignKey(Party)
    constituency = models.ForeignKey(Constituency)
    guardian_url = models.CharField(max_length=255, blank=True)
    guardian_image_url = models.CharField(max_length=255, blank=True)

class FinancialYear(models.Model):
    name = models.CharField(max_length=10)

class Document(models.Model):
    title = models.CharField(max_length=100, blank=True)
    filename = models.CharField(max_length=100)
    mp = models.ForeignKey(MP)
    financial_year = models.ForeignKey(FinancialYear)

class Page(models.Model):
    document = models.ForeignKey(Document)
    page_number = models.IntegerField()

class User(models.Model):
    created = models.DateTimeField(auto_now_add = True)
    username = models.TextField(max_length = 100)
    password_hash = models.CharField(max_length = 128, blank=True)

class LineItemCategory(models.Model):
    order = models.IntegerField(default = 0)
    name = models.CharField(max_length = 32)

class LineItem(models.Model):
    user = models.ForeignKey(User)
    page = models.ForeignKey(Page)
    type = models.CharField(max_length = 16, choices = (
        ('claim', 'claim'),
        ('proof', 'proof'),
    ), db_index = True)
    date = models.DateField(null = True, blank = True)
    amount = models.DecimalField(max_digits=20, decimal_places=2)
    description = models.CharField(max_length = 255, blank = True)
    created = models.DateTimeField(auto_now_add = True, db_index = True)
    categories = models.ManyToManyField(LineItemCategory, blank=True)

class Vote(models.Model):
    user = models.ForeignKey(User, related_name = 'votes')
    page = models.ForeignKey(Page, related_name = 'votes')
    obsolete = models.BooleanField(default = False)
    vote_type = models.CharField(max_length = 32, blank = True)
    ip_address = models.CharField(max_length = 32)
    created = models.DateTimeField(auto_now_add = True)

class TypeVote(Vote):
    type = models.CharField(max_length = 10, choices = (
        ('claim', 'Claim'),
        ('proof', 'Proof'),
        ('blank', 'Blank'),
        ('other', 'Other'),
    ))

class InterestingVote(Vote):
    status = models.CharField(max_length = 10, choices = (
        ('no', 'Not interesting'),
        ('yes', 'Interesting'),
        ('known', 'Interesting but known'),
        ('very', 'Investigate this!'),
    ))
Frictionless registration
Page filters
page_filters = (
    # Maps name of filter to dictionary of kwargs to doc.pages.filter()
    ('reviewed', { 'votes__isnull': False }),
    ('unreviewed', { 'votes__isnull': True }),
    ('with line items', { 'line_items__isnull': False }),
    ('interesting', { 'votes__interestingvote__status': 'yes' }),
    ('interesting but known', { 'votes__interestingvote__status': 'known'...)
page_filters_lookup = dict(page_filters)

pages = doc.pages.all()
if page_filter:
    kwargs = page_filters_lookup.get(page_filter)
    if kwargs is None:
        raise Http404, 'Invalid page filter: %s' % page_filter
    pages = pages.filter(**kwargs).distinct()

# Build the filters
filters = []
for name, kwargs in page_filters:
    filters.append({
        'name': name,
        'count': doc.pages.filter(**kwargs).distinct().count(),
    })
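The named-filter idea above can be sketched without Django: an ordered list of (name, test) pairs doubles as both the lookup table for requests and the menu of counts. This is a minimal stand-in sketch, not the Guardian code; the page dicts and predicate names are invented for illustration.

```python
# Framework-free sketch of the named-filter pattern: the ordered
# pairs drive both filtering and the per-filter counts.
pages = [
    {'id': 1, 'votes': 3, 'line_items': 0},
    {'id': 2, 'votes': 0, 'line_items': 2},
    {'id': 3, 'votes': 0, 'line_items': 0},
]

page_filters = (
    ('reviewed', lambda p: p['votes'] > 0),
    ('unreviewed', lambda p: p['votes'] == 0),
    ('with line items', lambda p: p['line_items'] > 0),
)
page_filters_lookup = dict(page_filters)

def filtered(page_filter):
    # Unknown filter names are an error, mirroring the Http404 above
    predicate = page_filters_lookup.get(page_filter)
    if predicate is None:
        raise KeyError('Invalid page filter: %s' % page_filter)
    return [p for p in pages if predicate(p)]

# Counts for the filter menu, preserving declaration order
filters = [
    {'name': name, 'count': len([p for p in pages if pred(p)])}
    for name, pred in page_filters
]
```

Keeping the filters as a single ordered tuple means adding a new filter is a one-line change that updates both the URL handling and the menu.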
Matching names
http://github.com/simonw/datamatcher
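datamatcher itself keeps a human in the loop, but the automated core of matching scraped MP names against canonical ones can be sketched with the standard library's difflib. The names and cutoff below are illustrative assumptions, not taken from the tool.

```python
import difflib

def best_match(name, candidates, cutoff=0.6):
    """Return the closest candidate name, or None if nothing is close enough.

    A rough sketch of fuzzy name matching with stdlib difflib; a real
    workflow would surface ambiguous cases to a human reviewer.
    """
    matches = difflib.get_close_matches(name, candidates, n=1, cutoff=cutoff)
    return matches[0] if matches else None

official_names = ['Margaret Beckett', 'Hilary Benn', 'Gordon Brown']
best_match('Gordan Brown', official_names)  # -> 'Gordon Brown'
```

Matching on normalised strings (honorifics stripped, lowercased) before comparing would catch more of the variants that appear in scraped data.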
On the day
def get_mp_pages():
    "Returns list of (mp-name, mp-page-url) tuples"
    soup = Soup(urllib.urlopen(INDEX_URL))
    mp_links = []
    for link in soup.findAll('a'):
        if link.get('title', '').endswith("'s allowances"):
            mp_links.append(
                (link['title'].replace("'s allowances", ''), link['href'])
            )
    return mp_links

def get_pdfs(mp_url):
    "Returns list of (description, years, pdf-url, size) tuples"
    soup = Soup(urllib.urlopen(mp_url))
    pdfs = []
    trs = soup.findAll('tr')[1:]  # Skip the first, it's the table header
    for tr in trs:
        name_td, year_td, pdf_td = tr.findAll('td')
        name = name_td.string
        year = year_td.string
        pdf_url = pdf_td.find('a')['href']
        size = pdf_td.find('a').contents[-1].replace('(', '').replace(')', '')
        pdfs.append((name, year, pdf_url, size))
    return pdfs
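The scraper above uses BeautifulSoup; the same link-harvesting rule can be sketched with nothing but the standard library's html.parser. The HTML fragment below is a made-up stand-in for the real Parliament index page.

```python
from html.parser import HTMLParser

class AllowanceLinkParser(HTMLParser):
    """Collect (mp-name, url) pairs from <a> tags whose title ends
    with "'s allowances" -- the same rule as get_mp_pages() above."""
    def __init__(self):
        super().__init__()
        self.mp_links = []

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        attrs = dict(attrs)
        title = attrs.get('title', '')
        if title.endswith("'s allowances"):
            self.mp_links.append(
                (title.replace("'s allowances", ''), attrs.get('href'))
            )

# Invented fragment standing in for the real index page:
html = '''
<a title="Diane Abbott's allowances" href="/abbott.htm">Abbott</a>
<a title="Contact us" href="/contact">Contact</a>
'''
parser = AllowanceLinkParser()
parser.feed(html)
# parser.mp_links == [('Diane Abbott', '/abbott.htm')]
```

BeautifulSoup earns its keep on the messier table-scraping in get_pdfs(), where positional td handling and forgiving parsing matter more.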
“Drop Everything”
Photoshop + AppleScript
vs.
Java + IntelliJ
Images on our docroot (S3 upload was taking too long)
Blitz QA
Launch! (on EC2)
Crash #1: more Apache children than MySQL connections
unreviewed_count = Page.objects.filter(
    votes__isnull = True
).distinct().count()

SELECT COUNT(DISTINCT `expenses_page`.`id`)
FROM `expenses_page`
LEFT OUTER JOIN `expenses_vote`
    ON (`expenses_page`.`id` = `expenses_vote`.`page_id`)
WHERE `expenses_vote`.`id` IS NULL
unreviewed_count = cache.get('homepage:unreviewed_count')
if unreviewed_count is None:
    unreviewed_count = Page.objects.filter(
        votes__isnull = True
    ).distinct().count()
    cache.set('homepage:unreviewed_count', unreviewed_count, 60)
• With 70,000 pages and a LOT of votes...
• DB takes up 135% of CPU
• Cache the count in memcached...
• DB drops to 35% of CPU
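The cache-aside pattern behind that fix can be sketched without Django or memcached, using a dict with expiry times as a stand-in cache. All names here are illustrative; the counter simulates the expensive COUNT(DISTINCT ...) join.

```python
import time

class TinyCache:
    """Dict-backed stand-in for memcached: get() returns None on a
    miss or after expiry, set() takes a timeout in seconds."""
    def __init__(self):
        self._store = {}

    def get(self, key):
        value, expires = self._store.get(key, (None, 0))
        return value if time.time() < expires else None

    def set(self, key, value, timeout):
        self._store[key] = (value, time.time() + timeout)

cache = TinyCache()
calls = []  # Tracks how often the "expensive query" actually runs

def expensive_count():
    calls.append(1)  # Stands in for the COUNT(DISTINCT ...) join
    return 70000

def unreviewed_count():
    count = cache.get('homepage:unreviewed_count')
    if count is None:
        count = expensive_count()
        cache.set('homepage:unreviewed_count', count, 60)
    return count

unreviewed_count()
unreviewed_count()
# The second call is served from cache: len(calls) == 1
```

A 60-second timeout means the homepage count can be up to a minute stale, which is a fine trade for dropping the database from 135% to 35% of CPU.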
unreviewed_count = Page.objects.filter( votes__isnull = True ).distinct().count()
reviewed_count = Page.objects.filter( votes__isnull = False ).distinct().count()
unreviewed_count = Page.objects.filter( is_reviewed = False ).count()
Migrating to InnoDB on a separate server
ssh mps-live "mysqldump mp_expenses" |
    sed 's/ENGINE=MyISAM/ENGINE=InnoDB/g' |
    sed 's/CHARSET=latin1/CHARSET=utf8/g' |
    ssh mysql-big "mysql -u root mp_expenses"
“next” button
def next_global(request):
    # Next unreviewed page from the whole site
    all_unreviewed_pages = Page.objects.filter(
        is_reviewed = False
    ).order_by('?')
    if all_unreviewed_pages:
        return Redirect(all_unreviewed_pages[0].get_absolute_url())
    else:
        return HttpResponse('All pages have been reviewed!')
import random

def next_global_from_cache(request):
    page_ids = cache.get('unreviewed_page_ids')
    if page_ids:
        return Redirect('/page/%s/' % random.choice(page_ids))
    else:
        return next_global(request)
from django.core.management.base import BaseCommand
from mp_expenses.expenses.models import Page
from django.core.cache import cache

class Command(BaseCommand):
    help = """
    populate unreviewed_page_ids in memcached
    """
    requires_model_validation = True
    can_import_settings = True

    def handle(self, *args, **options):
        ids = list(Page.objects.exclude(
            is_reviewed = True
        ).values_list('pk', flat=True)[:1000])
        cache.set('unreviewed_page_ids', ids)
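The division of labour above, where a periodically-run command precomputes candidate ids and the view just does a random.choice with a slow-path fallback, can be sketched framework-free. The cache dict and page dicts are invented stand-ins for memcached and the Page table.

```python
import random

cache = {}  # stand-in for memcached

def populate_unreviewed_ids(pages):
    """The management command's job: snapshot up to 1000 candidate ids."""
    ids = [p['id'] for p in pages if not p['reviewed']][:1000]
    cache['unreviewed_page_ids'] = ids

def next_unreviewed(pages):
    """The view's job: cheap random pick; slow full scan only if
    the cache is cold, like falling back to next_global()."""
    page_ids = cache.get('unreviewed_page_ids')
    if page_ids:
        return random.choice(page_ids)
    unreviewed = [p['id'] for p in pages if not p['reviewed']]
    return random.choice(unreviewed) if unreviewed else None

# Even-numbered pages reviewed, odd ones not:
pages = [{'id': i, 'reviewed': i % 2 == 0} for i in range(10)]
populate_unreviewed_ids(pages)
# next_unreviewed(pages) returns one of 1, 3, 5, 7, 9
```

The cached list goes slightly stale between runs, so a user can occasionally be sent to a just-reviewed page, which is harmless for this workload.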
The numbers
Final thoughts
• High score tables help
• MP photographs really help
• Keeping up the interest is hard
• Next step: start releasing the data