# coding: UTF-8
import itertools
import locale
def natsplit(ustr):
    """
    Split @ustr into runs of normal characters
    and numbers.

    @ustr must be a text (unicode) string. Each run of decimal digits is
    converted to an integer; every other run is kept as a string. The
    parts are returned as a tuple in their original order, which is
    empty for an empty string.

    Digits are recognised with ``isdecimal`` rather than ``isdigit``:
    characters such as superscript two satisfy ``isdigit`` but are
    rejected by ``int()``, so they are treated as normal characters
    instead of raising ValueError.

    >>> natsplit(u"file1.txt") == (u"file", 1, u".txt")
    True
    """
    def _isdecimal(char):
        # Method lookup on the character itself, so no dependency on the
        # Python-2-only name ``unicode``.
        return char.isdecimal()

    parts = []
    for isnum, group in itertools.groupby(ustr, _isdecimal):
        part = u"".join(group)
        parts.append(int(part) if isnum else part)
    return tuple(parts)
def keyfunc(ntupl):
    u"""
    Return a locale-aware key for tuple @ntupl

    Text parts are passed through locale.strxfrm, so that comparing the
    returned keys follows the collation rules of the current locale.
    Parts that are not text (such as the numbers produced by natsplit)
    are kept unchanged.

    >>> keyfunc((u'file', 1, u'.txt'))
    ('file', 1, '.txt')

    The result for non-ASCII text depends on the locale, so there is no
    way to doctest it. For example, after
    locale.setlocale(locale.LC_ALL, ''), keyfunc((u'löv', 1, u'.txt'))
    yields a transformed form of u'löv' that is specific to that locale.
    """
    def _transform(obj):
        # Only probe for the attribute here, so an AttributeError raised
        # for any other reason is not silently swallowed.
        encode = getattr(obj, "encode", None)
        if encode is None:
            # Not text (e.g. a number): use it as-is.
            return obj
        if isinstance(obj, str):
            # Native string: strxfrm accepts it directly. On Python 3
            # it must NOT be encoded, because strxfrm rejects bytes.
            return locale.strxfrm(obj)
        # NB! On Python 2 locale.strxfrm is broken for unicode, so
        # we have to encode into UTF-8 here!
        return locale.strxfrm(encode("UTF-8"))

    return tuple(_transform(obj) for obj in ntupl)
def sortcorpus(corpus):
    """
    Sort @corpus according to current locale
    (The caller needs to call setlocale)

    Each item is split into text and number runs and turned into a
    locale-aware key, so that embedded numbers are ordered by value
    (f5.txt before f10.txt). Returns a new sorted list; @corpus itself
    is not modified.

    >>> sortcorpus(u'''
    ... f0.txt f10.txt f100.txt f105.txt f110.txt f15.txt f20.txt f25.txt
    ... f30.txt f35.txt f40.txt f45.txt f5.txt f50.txt f55.txt f60.txt f65.txt
    ... f70.txt f75.txt f80.txt f85.txt f90.txt f95.txt
    ... '''.split()) == u'''
    ... f0.txt f5.txt f10.txt f15.txt f20.txt f25.txt
    ... f30.txt f35.txt f40.txt f45.txt f50.txt f55.txt
    ... f60.txt f65.txt f70.txt f75.txt f80.txt f85.txt
    ... f90.txt f95.txt f100.txt f105.txt f110.txt
    ... '''.split()
    True
    """
    def key(o):
        # Tag every part with a rank so that numbers and text are never
        # compared with each other directly: (0, number) < (1, text).
        # This is the order Python 2 already gives mixed int/str tuples,
        # and it avoids the TypeError Python 3 raises for such
        # comparisons (e.g. u"a1" against u"1a").
        return tuple(
            (1, part) if isinstance(part, (bytes, str)) else (0, part)
            for part in keyfunc(natsplit(o))
        )
    return sorted(corpus, key=key)
if __name__ == '__main__':
    # Run the doctests of this module when executed as a script.
    import sys
    if sys.version_info[0] < 3:
        # hack to use unicode in docstrings..
        # not needed now but..
        # Only applicable on Python 2: reload() is not a builtin and
        # sys.setdefaultencoding() does not exist on Python 3.
        reload(sys)
        sys.setdefaultencoding("UTF-8")
    import doctest
    doctest.testmod()