import isEmpty from './utils/isEmpty';
import isCharEnglishPunctuation from './utils/isCharEnglishPunctuation';
import isCharJapanesePunctuation from './utils/isCharJapanesePunctuation';
import isCharRomaji from './utils/isCharRomaji';
import isCharKanji from './utils/isCharKanji';
import isCharHiragana from './utils/isCharHiragana';
import isCharKatakana from './utils/isCharKatakana';
import isCharJapanese from './utils/isCharJapanese';
// Single-character predicates for the space and digit classes.
// The "En" variants match the ASCII forms; the "Ja" variants match the
// full-width forms used in Japanese text. The full-width characters are
// written as escapes so they cannot be silently folded to ASCII by an editor
// or a Unicode normalisation step (which would make each Ja predicate
// identical to its En counterpart).

/** True when `x` is the ASCII space (U+0020). */
const isCharEnSpace = (x) => x === ' ';

/** True when `x` is the ideographic (full-width) space (U+3000). */
const isCharJaSpace = (x) => x === '\u3000';

/** True when `x` contains a full-width digit (U+FF10 - U+FF19). */
const isCharJaNum = (x) => /[\uFF10-\uFF19]/.test(x);

/** True when `x` contains an ASCII digit (0-9). */
const isCharEnNum = (x) => /[0-9]/.test(x);
/**
 * Type labels that `getType` can return for a character.
 * In compact mode only `EN`, `JA` and `OTHER` are produced; the remaining
 * labels are used by the non-compact classification.
 */
const TOKEN_TYPES = {
  // Language buckets
  EN: 'en',
  JA: 'ja',

  // Numerals
  EN_NUM: 'englishNumeral',
  JA_NUM: 'japaneseNumeral',

  // Punctuation
  EN_PUNC: 'englishPunctuation',
  JA_PUNC: 'japanesePunctuation',

  // Japanese scripts
  KANJI: 'kanji',
  HIRAGANA: 'hiragana',
  KATAKANA: 'katakana',

  // Everything else
  SPACE: 'space',
  OTHER: 'other',
};
/**
 * Classifies `input` (a single character when called from `tokenize`) as one
 * of the `TOKEN_TYPES` labels.
 *
 * The predicates are tried in a fixed order and the first one that matches
 * decides the result, so the order of the checks below is significant.
 *
 * - Non-compact (default): each character class gets its own label (space,
 *   numeral, punctuation, kanji, hiragana, katakana, ja, en).
 * - Compact: only `en`, `ja` and `other` are returned. Spaces are folded into
 *   their language, and numerals and punctuation are reported as `other`.
 *
 * @param {String} input value to classify
 * @param {Boolean} [compact=false] use the coarse en/ja/other classification
 * @return {String} one of the `TOKEN_TYPES` values; `other` when nothing matches
 */
// prettier-ignore
export function getType(input, compact = false) {
  const types = TOKEN_TYPES;

  if (compact) {
    if (isCharJaNum(input))               { return types.OTHER; }
    if (isCharEnNum(input))               { return types.OTHER; }
    if (isCharEnSpace(input))             { return types.EN; }
    if (isCharEnglishPunctuation(input))  { return types.OTHER; }
    if (isCharJaSpace(input))             { return types.JA; }
    if (isCharJapanesePunctuation(input)) { return types.OTHER; }
    if (isCharJapanese(input))            { return types.JA; }
    if (isCharRomaji(input))              { return types.EN; }
    return types.OTHER;
  }

  if (isCharJaSpace(input))             { return types.SPACE; }
  if (isCharEnSpace(input))             { return types.SPACE; }
  if (isCharJaNum(input))               { return types.JA_NUM; }
  if (isCharEnNum(input))               { return types.EN_NUM; }
  if (isCharEnglishPunctuation(input))  { return types.EN_PUNC; }
  if (isCharJapanesePunctuation(input)) { return types.JA_PUNC; }
  // Specific scripts are tested before the generic Japanese check.
  if (isCharKanji(input))               { return types.KANJI; }
  if (isCharHiragana(input))            { return types.HIRAGANA; }
  if (isCharKatakana(input))            { return types.KATAKANA; }
  if (isCharJapanese(input))            { return types.JA; }
  if (isCharRomaji(input))              { return types.EN; }
  return types.OTHER;
}
/**
 * Splits input into an array of tokens, starting a new token every time the
 * type reported by `getType` changes from one character to the next.
 * Token types are the opinionated set
 * `'en', 'ja', 'englishNumeral', 'japaneseNumeral','englishPunctuation', 'japanesePunctuation','kanji', 'hiragana', 'katakana', 'space', 'other'`.
 * If `{ compact: true }` then many same-language tokens are combined (spaces + text, kanji + kana, numeral + punctuation).
 * If `{ detailed: true }` then return array will contain `{ type, value }` instead of `'value'`
 *
 * Returns `[]` when input is `null`/`undefined` or `isEmpty(input)` is true.
 * A string is walked by code point, so a surrogate pair is never split
 * across tokens.
 *
 * Note: the Japanese segments of the longer examples below use full-width
 * characters (ideographic space, full-width digit, full-width letters and
 * full-width exclamation mark), which is why they are typed as Japanese.
 *
 * @param {String} input text
 * @param {{compact: Boolean | undefined, detailed: Boolean | undefined}} [options={ compact: false, detailed: false}] options to modify output style
 * @return {(String[]|Array.<{type: String, value: String}>)} text split into tokens containing values, or detailed object
 * @example
 * tokenize('ふふフフ')
 * // ['ふふ', 'フフ']
 *
 * tokenize('感じ')
 * // ['感', 'じ']
 *
 * tokenize('人々')
 * // ['人々']
 *
 * tokenize('truly 私は悲しい')
 * // ['truly', ' ', '私', 'は', '悲', 'しい']
 *
 * tokenize('truly 私は悲しい', { compact: true })
 * // ['truly ', '私は悲しい']
 *
 * tokenize('5romaji here...!?人々漢字ひらがなカタ　カナ４「ＳＨＩＯ」。！')
 * // [ '5', 'romaji', ' ', 'here', '...!?', '人々漢字', 'ひらがな', 'カタ', '　', 'カナ', '４', '「', 'ＳＨＩＯ', '」。！']
 *
 * tokenize('5romaji here...!?人々漢字ひらがなカタ　カナ４「ＳＨＩＯ」。！', { compact: true })
 * // [ '5', 'romaji here', '...!?', '人々漢字ひらがなカタ　カナ', '４「', 'ＳＨＩＯ', '」。！']
 *
 * tokenize('5romaji here...!?人々漢字ひらがなカタ　カナ４「ＳＨＩＯ」。！ لنذهب', { detailed: true })
 * // [
 *  { type: 'englishNumeral', value: '5' },
 *  { type: 'en', value: 'romaji' },
 *  { type: 'space', value: ' ' },
 *  { type: 'en', value: 'here' },
 *  { type: 'englishPunctuation', value: '...!?' },
 *  { type: 'kanji', value: '人々漢字' },
 *  { type: 'hiragana', value: 'ひらがな' },
 *  { type: 'katakana', value: 'カタ' },
 *  { type: 'space', value: '　' },
 *  { type: 'katakana', value: 'カナ' },
 *  { type: 'japaneseNumeral', value: '４' },
 *  { type: 'japanesePunctuation', value: '「' },
 *  { type: 'ja', value: 'ＳＨＩＯ' },
 *  { type: 'japanesePunctuation', value: '」。！' },
 *  { type: 'space', value: ' ' },
 *  { type: 'other', value: 'لنذهب' },
 * ]
 *
 * tokenize('5romaji here...!?人々漢字ひらがなカタ　カナ４「ＳＨＩＯ」。！ لنذهب', { compact: true, detailed: true})
 * // [
 *  { type: 'other', value: '5' },
 *  { type: 'en', value: 'romaji here' },
 *  { type: 'other', value: '...!?' },
 *  { type: 'ja', value: '人々漢字ひらがなカタ　カナ' },
 *  { type: 'other', value: '４「' },
 *  { type: 'ja', value: 'ＳＨＩＯ' },
 *  { type: 'other', value: '」。！' },
 *  { type: 'en', value: ' ' },
 *  { type: 'other', value: 'لنذهب' },
 * ]
 */
function tokenize(input, { compact = false, detailed = false } = {}) {
  if (input == null || isEmpty(input)) {
    return [];
  }

  const tokens = [];
  let prevType;

  // Single pass: extend the last token in place while the type is unchanged,
  // otherwise open a new token. Appending in place keeps the work linear in
  // the input length instead of copying the token list for every character.
  for (const char of input) {
    const currType = getType(char, compact);
    const extendsLast = tokens.length > 0 && currType === prevType;

    if (extendsLast) {
      const last = tokens.length - 1;
      if (detailed) {
        tokens[last].value += char;
      } else {
        tokens[last] += char;
      }
    } else {
      tokens.push(detailed ? { type: currType, value: char } : char);
    }

    prevType = currType;
  }

  return tokens;
}
export default tokenize;