pa-pe-pi-po-pure Python Text Processing

Embed Size (px)

DESCRIPTION

Experimentos com processamento de texto, da manipulação de strings básica até um exemplo de NLP, passando por compiladores.

Citation preview

2. Anatomia do Blá Eu, Vocês e Python retrospectiva PythonBrasil[7] anos! pa-pe-pi-po-pure python text processing referências 1 palavra dos patrocinadores 3. Quem está aí? Profissionais de Informática Desenvolvedores Estudantes Professores 1ª vez na PyConBrasil Membros APyBr Nenhuma resposta acima! 4. Cenas dos últimos capítulos... [1] 2005 - BigKahuna [2] 2006 - Show Pyrotécnico Iteradores, Geradores, Hooks, Decoradores [3] 2007 - Show Pyrotécnico II Routing, RTSP, Twisted, GIS [4] 2008 - ISIS-NBP Bibliotecas Digitais [5] 2009 - Rest, Gtw e Compiladores SFC(Rede Petri) + ST(Pascal) > Ladder [5] 2010 - Potter vs Voldemort: Lições ofidiglotas da prática pythonica 5. >>> type("bla") <type 'str'> >>> "".join(['pa',"pe",'''pi''',"""po"""]) 'papepipo' >>> str(2**1024)[100:120] '21120113879871393357' >>> 2**1024 179769313486231590772930519078902473361797697894230657273430081157732675805500963132708477322407536021120113879871393357658789768814416622492847430639474124377767893424865485276302219601246094119453082952085005768838150682342462881473913110540827237163350510684586298239947245938479716304835356329624224137216L >>> 'ariediod'[::-1] 'doideira' 6. >>> "deu branco no prefixo e no sufixo, limpa com strip ".strip() 'deu branco no prefixo e no sufixo, limpa com strip' >>> _.startswith("deu") True >>> "o rato roeu a roupa do rei de roma".partition("r") ('o ', 'r', 'ato roeu a roupa do rei de roma') >>> "o rato roeu a roupa do rei de roma".split("r") ['o ', 'ato ', 'oeu a ', 'oupa do ', 'ei de ', 'oma'] >>> "o rato roeu a roupa do rei de roma".split() ['o', 'rato', 'roeu', 'a', 'roupa', 'do', 'rei', 'de', 'roma'] 7. >>> r"W:\nao\precisa\de\escape" 'W:\\nao\\precisa\\de\\escape' >>> type(r"W:\nao\precisa\de\escape") <type 'str'> >>> type(u"Unicode") <type 'unicode'> >>> print(u"\xc3\xa2") Traceback (most recent call last): File "<stdin>", line 1, in <module> UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1: ordinal not in range(128) >>> print(unicode('\xc3\xa1','iso-8859-1').encode('iso-8859-1')) >>> import codecs, sys >>> sys.stdout = codecs.lookup('iso-8859-1')[-1](sys.stdout) >>> print(u"\xc3\xa1") 8. 
>>> b"String de 8-bit chars" 'String de 8-bit chars' Python 2.6.1: >>> b"Bla" 'Bla' >>> b"Bla"=="Bla" True >>> type(b"Bla") <type 'str'> Python 3.1.4: >>> b"Bla" b'Bla' >>> type(b"Bla") <class 'bytes'> >>> type("Bla") <class 'str'> >>> "Bla"==b"Bla" False 9. >>> [ord(i) for i in "nulalexsedlex"] [110, 117, 108, 97, 108, 101, 120, 115, 101, 100, 108, 101, 120] >>> "".join([chr(i) for i in _]) 'nulalexsedlex' >>> 'lex' in _ True >>> import string >>> dir(string) ['Formatter', 'Template', '_TemplateMetaclass', '__builtins__', '__doc__', '__file__', '__name__', '__package__', '_float', '_idmap', '_idmapL', '_int', '_long', '_multimap', '_re', 'ascii_letters', 'ascii_lowercase', 'ascii_uppercase', 'atof', 'atof_error', 'atoi', 'atoi_error', 'atol', 'atol_error', 'capitalize', 'capwords', 'center', 'count', 'digits', 'expandtabs', 'find', 'hexdigits', 'index', 'index_error', 'join', 'joinfields', 'letters', 'ljust', 'lower', 'lowercase', 'lstrip', 'maketrans', 'octdigits', 'printable', 'punctuation', 'replace', 'rfind', 'rindex', 'rjust', 'rsplit', 'rstrip', 'split', 'splitfields', 'strip', 'swapcase', 'translate', 'upper', 'uppercase', 'whitespace', 'zfill'] 10. >>> string.hexdigits '0123456789abcdefABCDEF' >>> string.punctuation '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' >>> string.maketrans('','') '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' 11. >>> def t(x,y): return string.translate(x,string.maketrans('',''),y) ... >>> t("O rato roeu. O que? A roupa! De quem? Do rei, de roma;",string.punctuation) 'O rato roeu O que A roupa De quem Do rei de roma' >>> class Bla(object): ... def __str__(self): ... return "Belex" ... def __repr__(self): ... return "Bla()" ... >>> b = Bla() >>> for i in [b, eval(repr(b))]: ... 
print(i, end='\t') ... Belex Belex >>> 12. >>> class istr(str): ... pass >>> for name in 'eq lt le gt ge ne cmp contains'.split(): ... meth = getattr(str, '__%s__' % name) ... def new_meth(self, param, *args): ... return meth(self.lower(), param.lower(), *args) ... setattr(istr, '__%s__' % name, new_meth) ... >>> istr("SomeCamelCase") == istr("sOmeCaMeLcase") True >>> 'Ec' in istr("SomeCamel") True Adapted from Python Cookbook 13. >>> import re >>> pat = re.compile(re.escape("")) >>> re.escape("") >>> pat.sub("_","Hasta la vista baby") '_Hasta la vista_ baby' >>> date = re.compile(r"(\d\d\d\d-\d\d-\d\d)\s(\w+)") >>> date.findall("Em 2011-09-29 PythonBrasil na parada. Em 2010-10-21 curitiba hospedou") [('2011-09-29', 'PythonBrasil'), ('2010-10-21', 'curitiba')] 14. $ python -mtimeit -s "import re; n=re.compile(r'abra')" "n.search('abracadabra')" 1000000 loops, best of 3: 0.306 usec per loop $ python -mtimeit -s "import re; n=r'abra'" "n in 'abracadabra'" 10000000 loops, best of 3: 0.0591 usec per loop $ python -mtimeit -s "import re; n=re.compile(r'\d+$')" "n.match('0123456789')" 1000000 loops, best of 3: 0.511 usec per loop $ python -mtimeit -s "import re" "'0123456789'.isdigit()" 10000000 loops, best of 3: 0.0945 usec per loop Extracted from PyMag Jan 2008 15. $ python -mtimeit -s "import re;r=re.compile('pa|pe|pi|po|pu');h='patapetapitapotapuxa'" "r.search(h)" 1000000 loops, best of 3: 0.383 usec per loop $ python -mtimeit -s "import re;n=['pa','pe','pi','po','pu'];h='patapetapitapotapuxa'" "any(x in h for x in n)" 1000000 loops, best of 3: 0.914 usec per loop Extracted from PyMag Jan 2008 16. from pyparsing import Word, Literal, Combine import string def doSum(s,l,tokens): return int(tokens[0]) + int(tokens[2]) integer = Word(string.digits) addition = Combine(integer) + Literal('+') + Combine(integer) addition.setParseAction(doSum) >>> addition.parseString("5+7") ([12], {}) 17. 
import ply.lex as lex tokens = 'NUMBER', 'PLUS' t_PLUS = r'\+' def t_NUMBER(t): r'\d+' t.value = int(t.value) return t t_ignore = '\t\n\w' def t_error(t): t.lexer.skip(1) lexer = lex.lex() Adapted from http://www.dabeaz.com 18. import ply.yacc as yacc def p_expression_plus(p): 'expression : expression PLUS expression' p[0] = p[1] + p[3] def p_factor_num(p): 'expression : NUMBER' p[0] = p[1] def p_error(p): print "Syntax error in input!" parser = yacc.yacc() Adapted from http://www.dabeaz.com 19. >>> parser.parse("1+2 + 45 \n + 10") 58 >>> parser.parse("Quanto vale 2 + 7") 9 >>> parser.parse("A soma 2 + 7 resulta em 9") Syntax error in input! >>> parser.parse("2 + 7 9") Syntax error in input! Adapted from http://www.dabeaz.com 20. >>> parser.parse("1+2 + 45 \n + 10") 58 >>> parser.parse("Quanto vale 2 + 7") 9 >>> parser.parse("A soma 2 + 7 resulta em 9") Syntax error in input! >>> parser.parse("2 + 7 9") Syntax error in input! Adapted from http://www.dabeaz.com 21. from nltk.tokenize import sent_tokenize, word_tokenize msg = 'Congratulations to Erico and his team. PythonBrasil gets better every year. You are now the BiggestKahuna.' >>> sent_tokenize(msg) ['Congratulations to Erico and his team.', 'PythonBrasil gets better every year.', 'You are now the BiggestKahuna.'] >>> word_tokenize(msg) ['Congratulations', 'to', 'Erico', 'and', 'his', 'team.', 'PythonBrasil', 'gets', 'better', 'every', 'year.', 'You', 'are', 'now', 'the', 'BiggestKahuna', '.'] Extracted from NLP with Python 22. 
>>> def gender_features(word): ... return {"last_letter": word[-1]} ... >>> from nltk.corpus import names >>> len(names.words("male.txt")) 2943 >>> names = ([(name, 'male') for name in names.words('male.txt')] + ... [(name, 'female') for name in names.words('female.txt')]) >>> import random >>> random.shuffle(names) >>> featuresets = [(gender_features(n), g) for n, g in names] >>> train_set, test_set = featuresets[500:], featuresets[:500] >>> classifier = nltk.NaiveBayesClassifier.train(train_set) >>> classifier.classify(gender_features("Dorneles")) 'male' >>> classifier.classify(gender_features("Magali")) 'female' Extracted from NLP with Python 23. Referências 24. Uma palavra dos patrocinadores... 25. Obrigado a todos pela atenção. Rodrigo Dias Arruda Senra http://[email protected] As opiniões e conclusões expressas nesta apresentação são de exclusiva responsabilidade de Rodrigo Senra. Não é necessário requisitar permissão do autor para o uso de partes ou do todo desta apresentação, desde que não sejam feitas alterações no conteúdo reutilizado e que esta nota esteja presente na íntegra no material resultante. Imagens e referências para outros trabalhos nesta apresentação permanecem propriedade daqueles que detêm seus direitos de copyright.