Extending GrimoireLab capabilities

Preview:

Citation preview

Extending GrimoireLab capabilities

Alberto Pérez, Valerio Cosentino

@alpgarcia, @_valcos_

[alpgarcia, valcos]@bitergia.com

https://speakerdeck.com/bitergia

GrimoireCon, Brussels, 02-02-2018

Outline

GrimoireLab overview

Use case

Data extraction

Data visualization

/grimoirelab

/grimoirelab

/use_case

Can you prepare a use case?

Sure, what do we show?

Commit’s authors and issues, ok?

/use_case

Can you prepare a use case?

Sure, what do we show?

Commit’s authors and issues, ok?

Ok, comics’ authors and issues

/use_case

Comics

Creators

Characters

Stories

/use_case

Comics

Creators

Characters

Stories

/data_extraction

Perceval

Goal -> retrieve information* from data sources

* information: collection of items (issues, commits, comics)

/data_extraction

Perceval

API (data source) and Perceval data

{
    "backend_name": "Marvel",
    "backend_version": "0.1.0",
    "category": "comic",
    "data": {
        "format": "Comic",
        "id": 37030,
        "issueNumber": 2,
        "modified": "2010-08-04T01:32:01-0400",
        "pageCount": 32,
        "prices": [...],
        "characters": {...},
        "characters_data": {...}
    },
    "origin": "https://developer.marvel.com/",
    "perceval_version": "0.9.10",
    "tag": "https://developer.marvel.com/",
    "timestamp": 1517421033.892423,
    "updated_on": 1280899921.0,
    "uuid": "cc6fc7e818e48a18e498b2e865e554a1aa27b317"
}

API data} ]Perceval data

/data_extraction

Perceval

Organization -> 3 actors

Backend Client

CommandLine

/fetch

Perceval

Operations -> fetch & fetch-from-archive

Backend Client

CommandLine

Marvel API

Perceval data

API data

Archive

/fetch

Perceval

Backend

def fetch(self, from_date=DEFAULT_DATETIME): ... from_date = datetime_to_utc(from_date)

kwargs = {"from_date": from_date} items = super().fetch("comic", **kwargs)

return items

/fetch

Perceval

Backend

def fetch(self, from_date=DEFAULT_DATETIME): ... from_date = datetime_to_utc(from_date)

kwargs = {"from_date": from_date} items = super().fetch("comic", **kwargs)

return items

def fetch(self, category, **kwargs): if self.archive: self.archive.init_metadata(...) self.client = self._init_client() for item in self.fetch_items(**kwargs): yield self.metadata(item)

def _init_client(self, from_archive=False): return MarvelClient(self.public_key, self.private_key, ..., self.max_retries, self.archive, from_archive)

/fetch

Perceval

Backend

def fetch(self, from_date=DEFAULT_DATETIME): ... from_date = datetime_to_utc(from_date)

kwargs = {"from_date": from_date} items = super().fetch("comic", **kwargs)

return items

def fetch(self, category, **kwargs): if self.archive: self.archive.init_metadata(...) self.client = self._init_client() for item in self.fetch_items(**kwargs): yield self.metadata(item)

def fetch_items(self, **kwargs): from_date = kwargs['from_date'] comic_groups = self.client.comics(from_date)

for comics in comic_groups: for comic in comics: ... comic['characters_data'] = self.client.comic_data(...) ... yield comic

/fetch

Perceval

Backend

def fetch(self, from_date=DEFAULT_DATETIME): ... from_date = datetime_to_utc(from_date)

kwargs = {"from_date": from_date} items = super().fetch("comic", **kwargs)

return items

def fetch(self, category, **kwargs): if self.archive: self.archive.init_metadata(...) self.client = self._init_client() for item in self.fetch_items(**kwargs): yield self.metadata(item)

def fetch_items(self, **kwargs): from_date = kwargs['from_date'] comic_groups = self.client.comics(from_date)

for comics in comic_groups: for comic in comics: ... comic['characters_data'] = self.client.comic_data(...) ... yield comic

Client

/fetch

Perceval

Client

def comics(self, from_date=None): payload = { 'orderBy': 'modified', 'limit': self.items_per_page } if from_date: payload['modifiedSince'] = from_date.isoformat() ... path = urijoin(MARVEL_API_URL, "comics") return self.fetch_items(path, payload)

/fetch

Perceval

Client

def fetch_items(self, path, payload): response = self.fetch(path, payload=payload) items_info = response.json()['data']

total = items_info['total'] count = items_info['count']

while True: yield items_info['results'] ...code for pagination..

def comics(self, from_date=None): payload = { 'orderBy': 'modified', 'limit': self.items_per_page } if from_date: payload['modifiedSince'] = from_date.isoformat() ... path = urijoin(MARVEL_API_URL, "comics") return self.fetch_items(path, payload)

/fetch

Perceval

Client

def fetch_items(self, path, payload): response = self.fetch(path, payload=payload) items_info = response.json()['data']

total = items_info['total'] count = items_info['count']

while True: yield items_info['results'] ...code for pagination..

def fetch(self, url, payload=None, headers=None, ...): if self.from_archive: response = self._fetch_from_archive(url, payload, headers) else: response = self._fetch_from_remote(url, payload, headers, ...)

return response

def comics(self, from_date=None): payload = { 'orderBy': 'modified', 'limit': self.items_per_page } if from_date: payload['modifiedSince'] = from_date.isoformat() ... path = urijoin(MARVEL_API_URL, "comics") return self.fetch_items(path, payload)

/fetch

Perceval

Client

def fetch_items(self, path, payload): response = self.fetch(path, payload=payload) items_info = response.json()['data']

total = items_info['total'] count = items_info['count']

while True: yield items_info['results'] ...code for pagination..

def fetch(self, url, payload=None, headers=None, ...): if self.from_archive: response = self._fetch_from_archive(url, payload, headers) else: response = self._fetch_from_remote(url, payload, headers, ...)

return response

def comics(self, from_date=None): payload = { 'orderBy': 'modified', 'limit': self.items_per_page } if from_date: payload['modifiedSince'] = from_date.isoformat() ... path = urijoin(MARVEL_API_URL, "comics") return self.fetch_items(path, payload)

def _fetch_from_remote(self, ...): response = ... try: response.raise_for_status() except Exception as e: response = e raise e finally: if self.archive: self.archive.store(..., response) return response

/fetch

Perceval

Recap

def fetch(self, from_date=DEFAULT_DATETIME):

def fetch(self, category, **kwargs):

def fetch_items(self, **kwargs):

def comics(self, from_date=None):

def fetch_items(self, path, payload):

def fetch(self, url, payload=None, headers=None, ...):

def _fetch_from_remote(self, ...):

Client / Backend

def _init_client(...):

/fetch-from-archive

Perceval

Operations -> fetch & fetch-from-archive

Backend Client

CommandLine

Archive

Perceval data

API data

/fetch-from-archive

Perceval

Backend

def fetch_from_archive(self): if not self.archive: raise ArchiveError(cause="...")

self.client = self._init_client(from_archive=True) self.archive._load_metadata()

for item in self.fetch_items(**self.archive.backend_params): yield self.metadata(item)

/fetch-from-archive

Perceval

Backend

def fetch_from_archive(self): if not self.archive: raise ArchiveError(cause="...")

self.client = self._init_client(from_archive=True) self.archive._load_metadata()

for item in self.fetch_items(**self.archive.backend_params): yield self.metadata(item)

def fetch_items(self, path, payload):

/fetch-from-archive

Perceval

Backend

def fetch_from_archive(self): if not self.archive: raise ArchiveError(cause="...")

self.client = self._init_client(from_archive=True) self.archive._load_metadata()

for item in self.fetch_items(**self.archive.backend_params): yield self.metadata(item)

Client

def fetch_items(self, path, payload):

/fetch-from-archive

Perceval

Client

def fetch_items(self, path, payload):

def fetch(self, url, payload=None, headers=None, ...): if self.from_archive: response = self._fetch_from_archive(url, payload, headers) else: response = self._fetch_from_remote(url, payload, headers, ...)

return response

def comics(self, from_date=None):

/fetch-from-archive

Perceval

Client

def fetch_items(self, path, payload):

def fetch(self, url, payload=None, headers=None, ...): if self.from_archive: response = self._fetch_from_archive(url, payload, headers) else: response = self._fetch_from_remote(url, payload, headers, ...)

return response

def comics(self, from_date=None):

def _fetch_from_archive(self, ...):

response = self.archive.retrieve(url, payload, headers)

if not isinstance(response, requests.Response): raise response

return response

/fetch-from-archive

Perceval

Recap

def fetch_from_archive(self):

def fetch_items(self, **kwargs):

def comics(self, from_date=None):

def fetch_items(self, path, payload):

def fetch(self, url, payload=None, headers=None, ...):

def _fetch_from_archive(self, ...):

Client / Backend

def _init_client(...):

/grimoirelab

/data_visualization

Raw index:

comic: {
    comic_id: …,
    title: …,
    creators: [
        { name: …, role: … },
        {…}
    ],
    a lot of additional info
}

Problem: there is no way to associate author and role in Kibana.

/data_visualization

Enriched index:

author: {
    comic_id: …,
    title: …,
    name: …,
    role: …,
    only some carefully selected info
}

Solution: store data from the author's point of view.

/data_visualization

/data_visualization

We needed some help, but our colleagues were a bit busy...

/data_visualization

We needed some help, but our colleagues were a bit busy...

/data_visualization

Hey Álvaro, we need you as the one and only expert in Gelk!

Mmmm, what do you need guys?

We need to enrich some data related to… Marvel comics

Are you kidding me? Marvel comics???

/data_visualization

Hey Álvaro, we need you as the one and only expert in Gelk!

Mmmm, what do you need guys?

We need to enrich some data related to… Marvel comics

Are you kidding me? Marvel comics???

I’M IN!!!

def enrich_items(self, ocean_backend): .... for item in items: creators = self.get_rich_item_creators(item) rich_item_creators += creators

if rich_item_creators: ncreators = self.elastic.bulk_upload(rich_item_creators, "id")

/data_visualization

Extend Enrich class:

class MarvelEnrich(Enrich):

From each raw item (comic) create N enriched items (creators):

For each comic, extract creators

Upload new items

/data_visualization

For each creator just copy things from here to there:

# Thumbnails
eitem['url_thumbnail'] = item['data']['thumbnail']['path']

And add some common fields:

def get_rich_item_creators(self, item): ... for creator in item['data']['creators']['items']: ecreator = self.get_rich_comic_creator(item, creator) creators_enrich.append(ecreator)

return (creators_enrich)

/data_visualization

...some hours of Kibana hacking later...

/data_visualization

...happy hacking hours, let me say...

/data_visualization

...and after some hours more with some help of @dmoreno

/data_visualization

...and after some hours more

/resources

grimoirelab/panels

grimoirelab/perceval

alpgarcia/grimoirecon18/marvel

alpgarcia/grimoireELK/tree/marvel-enrich

valeriocos/perceval/tree/marvel-backend

@grimoirelab @alpgarcia @_valcos_

Recommended