Source code for bogo.utils

# -*- coding: utf-8 -*-
#
# This file is part of ibus-bogo project.
#
# Copyright (C) 2012 Long T. Dam <longdt90@gmail.com>
# Copyright (C) 2012-2013 Trung Ngo <ndtrung4419@gmail.com>
# Copyright (C) 2013 Duong H. Nguyen <cmpitg@gmail.com>
#
# ibus-bogo is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ibus-bogo is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ibus-bogo.  If not, see <http://www.gnu.org/licenses/>.
#

from __future__ import unicode_literals


# Lowercase Vietnamese vowels, each base letter together with its
# tone-marked forms. Only lowercase forms are listed; is_vowel() lowercases
# its argument before looking it up here.
VOWELS = (
    "àáảãạaằắẳẵặăầấẩẫậâèéẻẽẹeềếểễệêìíỉĩịi"
    "òóỏõọoồốổỗộôờớởỡợơùúủũụuừứửữựưỳýỷỹỵy"
)


def join(alist):
    """
    Concatenate the strings in `alist` into a single string, with no
    separator between them.

    >>> join(['c', 'o', 'n'])
    'con'
    """
    return "".join(alist)


def is_vowel(char):
    """
    Tell whether `char` is one of the vowels listed in `VOWELS`, ignoring
    case.

    The lookup is a substring test on the lowercased argument, so this is
    meant to be called with a single character.
    """
    return char.lower() in VOWELS


def change_case(string, case):
    """
    Helper: return a new string obtained by converting the given string to
    the desired case.

    Args
        string - the text to convert
        case - falsy (e.g. 0): lower case, truthy (e.g. 1): upper case
    """
    if case:
        return string.upper()
    return string.lower()


def append_comps(comps, char):
    """
    Append a character to `comps` following this rule: a vowel is added to the
    vowel part if there is no last consonant, else to the last consonant part;
    a consonant is added to the first consonant part if both the vowel part
    and the last consonant part are empty, else to the last consonant part.

    `comps` is a three-item sequence [first consonant, vowel, last consonant].
    It is not modified; a new list is returned.

    >>> append_comps(['', '', ''], 'c')
    ['c', '', '']
    >>> append_comps(['c', '', ''], 'o')
    ['c', 'o', '']
    >>> append_comps(['c', 'o', ''], 'n')
    ['c', 'o', 'n']
    >>> append_comps(['c', 'o', 'n'], 'o')
    ['c', 'o', 'no']
    """
    # Work on a copy so the caller's sequence is left untouched.
    c = list(comps)
    if is_vowel(char):
        # Once a last consonant exists, everything is appended after it.
        pos = 2 if c[2] else 1
    else:
        # A consonant only extends the head while nothing follows the head.
        pos = 2 if (c[1] or c[2]) else 0
    c[pos] += char
    return c


# def gibberish_split(head, tail=""):
#     """
#     Try to split a string into two parts: the alphabetic part at the end and the
#     rest.

#     >>> gibberish_split("aoeu")
#     ("", "aoeu")
#     >>> gibberish_split("ao.eu")
#     ("ao.", "eu")
#     >>> gibberish_split("aoeu.")
#     ("aoeu.", "")
#     """
#     if head == "" or not head[-1].isalpha():
#         return (head, tail)
#     else:
#         return gibberish_split(head[:-1], head[-1] + tail)


def separate(string):
    """
    Separate a string into smaller parts: first consonant (or head), vowel,
    last consonant (if any).

    The string is scanned from its end: the trailing run of non-vowels becomes
    the last consonant, the run of vowels before it becomes the vowel, and
    whatever remains is the head. A string made only of non-vowels is
    returned as the head. A leading 'gi' (when followed by another vowel) or
    'qu' is treated as a consonant.

    Returns a list of three strings.

    >>> separate('tuong')
    ['t', 'uo', 'ng']
    >>> separate('ohmyfkinggod')
    ['ohmyfkingg', 'o', 'd']
    >>> separate('gia')
    ['gi', 'a', '']
    """
    def split_trailing(text, want_vowel):
        # Split `text` into (rest, tail), where `tail` is the longest suffix
        # whose characters all have is_vowel(...) == want_vowel. Scanning by
        # index avoids deep recursion and repeated copying on long input.
        cut = len(text)
        while cut > 0 and is_vowel(text[cut - 1]) == want_vowel:
            cut -= 1
        return text[:cut], text[cut:]

    head, last_consonant = split_trailing(string, False)
    first_consonant, vowel = split_trailing(head, True)

    if last_consonant and not (vowel + first_consonant):
        comps = [last_consonant, '', '']  # ['', '', b] -> ['b', '', '']
    else:
        comps = [first_consonant, vowel, last_consonant]

    # 'gi' and 'qu' are considered qualified consonants.
    # We want something like this:
    #     ['g', 'ia', ''] -> ['gi', 'a', '']
    #     ['q', 'ua', ''] -> ['qu', 'a', '']
    # The head is compared for exact equality so that a two-letter head such
    # as 'gG' is not mistaken for a lone 'g'.
    if comps[0] and comps[1]:
        first_vowel = comps[1][0]
        is_gi = (comps[0] in ('g', 'G') and first_vowel in ('i', 'I')
                 and len(comps[1]) > 1)
        is_qu = comps[0] in ('q', 'Q') and first_vowel in ('u', 'U')
        if is_gi or is_qu:
            comps[0] += first_vowel
            comps[1] = comps[1][1:]

    return comps