Java, 3416 bytes, 62%
This is my solution: I analyzed the list of given words and found the 60 most common bigrams and trigrams for each language. Now I check a word against my n-grams and choose the language with the most matching n-grams in the word.
/**
 * Guesses which of four languages a word belongs to by counting how many of each
 * language's frequent trigrams and bigrams occur in the word.
 *
 * <p>Row {@code i} of each n-gram table belongs to language code {@code i + 1}.
 * Code 4 is Hungarian; the other rows look like English, German and Italian in that
 * order, but the original source does not label them (TODO confirm).
 */
public class Classificator {
    /** Language code returned when the n-gram tables give no evidence at all. */
    private static final int FALLBACK_LANGUAGE = 4;

    /** Frequent trigrams, one row per language; a hit is worth 2 points. */
    static final String[][] triGr = {
        {"ing","ion","ent","tio","ted","nce","ter","res","ati","con","ess","ate","pro","ain","est","ons","men","ect","red","rea","com","ere","ers","nte","ine","her","ble","ist","tin","for","per","der","ear","str","ght","pre","ver","int","nde","the","igh","ive","sta","ure","end","enc","ned","ste","dis","ous","all","and","anc","ant","oun","ten","tra","are","sed","cti"},
        {"sch","che","ver","gen","ten","cht","ich","ein","ste","ter","hen","nde","nge","ach","ere","ung","den","sse","ers","and","eit","ier","ren","sen","ges","ang","ben","rei","est","nen","nte","men","aus","der","ent","hei","her","lle","ern","ert","uch","ine","ehe","auf","lie","tte","ige","ing","hte","mme","end","wei","len","hre","rau","ite","bes","ken","cha","ebe"},
        {"ent","are","ato","nte","ett","ere","ion","chi","con","one","men","nti","gli","pre","ess","att","tto","par","per","sta","tra","zio","and","iam","end","ter","res","est","nto","tta","acc","sci","cia","ver","ndo","amo","ant","str","tro","ssi","pro","era","eri","nta","der","ate","ort","com","man","tor","rat","ell","ale","gio","ont","col","tti","ano","ore","ist"},
        {"sze","ere","meg","ett","gye","ele","ond","egy","enn","ott","tte","ete","unk","ban","tem","agy","zer","esz","tet","ara","nek","hal","dol","mon","art","ala","ato","szt","len","men","ben","kap","ent","min","ndo","eze","sza","isz","fog","kez","ind","ten","tam","nak","fel","ene","all","asz","gon","mar","zem","szo","tek","zet","elm","het","eve","ssz","hat","ell"}
    };

    /** Frequent bigrams, one row per language; a hit is worth 1 point. */
    static final String[][] biGr = {
        {"in","ed","re","er","es","en","on","te","ng","st","nt","ti","ar","le","an","se","de","at","ea","co","ri","ce","or","io","al","is","it","ne","ra","ro","ou","ve","me","nd","el","li","he","ly","si","pr","ur","th","di","pe","la","ta","ss","ns","nc","ll","ec","tr","as","ai","ic","il","us","ch","un","ct"},
        {"en","er","ch","te","ge","ei","st","an","re","in","he","ie","be","sc","de","es","le","au","se","ne","el","ng","nd","un","ra","ar","nt","ve","ic","et","me","ri","li","ss","it","ht","ha","la","is","al","eh","ll","we","or","ke","fe","us","rt","ig","on","ma","ti","nn","ac","rs","at","eg","ta","ck","ol"},
        {"re","er","to","ar","en","te","ta","at","an","nt","ra","ri","co","on","ti","ia","or","io","in","st","tt","ca","es","ro","ci","di","li","no","ma","al","am","ne","me","le","sc","ve","sa","si","tr","nd","se","pa","ss","et","ic","na","pe","de","pr","ol","mo","do","so","it","la","ce","ie","is","mi","cc"},
        {"el","en","sz","te","et","er","an","me","ta","on","al","ar","ha","le","gy","eg","re","ze","em","ol","at","ek","es","tt","ke","ni","la","ra","ne","ve","nd","ak","ka","in","am","ad","ye","is","ok","ba","na","ma","ed","to","mi","do","om","be","se","ag","as","ez","ot","ko","or","cs","he","ll","nn","ny"}
    };

    /**
     * Returns the language code (1..4) whose n-gram table matches {@code word} best.
     *
     * <p>Each listed trigram contained in the word adds 2 to that language's score and
     * each listed bigram adds 1. An n-gram counts once, however often it occurs. On a
     * tie the lowest language code wins.
     *
     * @param word the word to classify; must not be {@code null}
     * @return a language code between 1 and 4 inclusive
     */
    public int guess(String word) {
        if (word.length() < 3) {
            // Most words shorter than 3 characters on the list are Hungarian.
            return FALLBACK_LANGUAGE;
        }

        int[] score = new int[triGr.length];
        for (int lang = 0; lang < score.length; lang++) {
            for (String trigram : triGr[lang]) {
                if (word.contains(trigram)) {
                    score[lang] += 2;
                }
            }
            for (String bigram : biGr[lang]) {
                if (word.contains(bigram)) {
                    score[lang] += 1;
                }
            }
        }

        int best = -1;
        int max = 0;
        for (int lang = 0; lang < score.length; lang++) {
            // Strict comparison keeps the first (lowest-numbered) language on ties.
            if (score[lang] > max) {
                max = score[lang];
                best = lang;
            }
        }

        // No n-gram matched at all. The previous code returned 5 here, which is not a
        // valid language code; use the same fallback as for short words instead.
        return best < 0 ? FALLBACK_LANGUAGE : best + 1;
    }
}
and this is my testcase
/**
 * Small accuracy harness for {@link Classificator}.
 *
 * <p>Reads {@code list.txt}, where each line holds a word followed by an integer
 * language code separated by whitespace, classifies every distinct word and prints
 * the number of correct guesses, the number of words tried and the hit percentage.
 */
public class Test {
    /** Maps each word to every language code it was listed with. */
    Map<String, List<Integer>> words = new HashMap<String, List<Integer>>();

    /**
     * Tells whether {@code lang} is one of the language codes recorded for {@code word}.
     *
     * @param word the word that was classified
     * @param lang the guessed language code
     * @return {@code true} if the word was listed with that code; {@code false}
     *         otherwise, including when the word is unknown
     */
    boolean validate(String word, Integer lang) {
        List<Integer> langs = words.get(word);
        // An unknown word used to cause a NullPointerException here.
        return langs != null && langs.contains(lang);
    }

    /**
     * Loads {@code list.txt}, runs the classifier over every word and prints
     * {@code "<correct> <total> <percentage>"}.
     *
     * @param args unused
     * @throws FileNotFoundException if {@code list.txt} cannot be opened
     */
    public static void main(String[] args) throws FileNotFoundException {
        Classificator cl = new Classificator();
        Test test = new Test();

        // try-with-resources so the file handle is released even if parsing fails.
        try (BufferedReader buf = new BufferedReader(new FileReader("list.txt"))) {
            buf.lines().forEach(test::process);
        } catch (FileNotFoundException e) {
            throw e;
        } catch (java.io.IOException e) {
            // Only close() can land here; surface it the way lines() reports read errors.
            throw new java.io.UncheckedIOException(e);
        }

        int correct = 0;
        int total = 0;
        for (String word : test.words.keySet()) {
            int lang = cl.guess(word);
            if (lang == 0) {
                // A guess of 0 is treated as "no answer" and left out of the statistics.
                continue;
            }
            total++;
            if (test.validate(word, lang)) {
                correct++;
            }
        }
        System.out.println(correct + " " + total + " " + (correct * 100f / total));
    }

    /**
     * Parses one {@code "<word> <language code>"} line and records the pair.
     * Blank lines and lines with fewer than two fields are skipped.
     *
     * @param x a raw line from the word list
     * @throws NumberFormatException if the second field is not an integer
     */
    private void process(String x) {
        String[] arr = x.trim().split("\\s+");
        if (arr.length < 2) {
            return;
        }
        String word = arr[0];
        words.computeIfAbsent(word, k -> new ArrayList<Integer>())
                .add(Integer.parseInt(arr[1]));
    }
}
Are you sure the lists are correct? I'm pretty sure I have never heard "the" in German. Does outputting an array with all possible languages count? E.g. "the" apparently is in all languages, so it would output {1,2,3,4} – Eumel – 2016-02-04T18:43:58.767
@Eumel The first couple English words might be present somewhere in the other lists as there might be English phrases in the texts of languages which were used to generate the word lists. You can categorize an input into only one language. (Which means, as mentioned in the question, that "your code can't always guess the expected output correctly".) – randomra – 2016-02-04T19:49:56.277
The lists only contain words with lowercase letters ... That's not entirely true. The
all_languages
file includes dozens of capitalized words (Mr
,Gutenberg
, etc.) and the non-words "" (empty string) and "]]|-". I assume it's OK to lowercase the former and delete the latter? – r3mainer – 2016-02-04T22:38:08.803
@squeamishossifrage Thanks for the catch. Updated the English lists. (There were ~60 uppercase words and 2 non-words.) – randomra – 2016-02-05T11:57:18.780
Why remove diacritics? If the goal is to distinguish languages that don’t have diacritics, then why not use languages that don’t have diacritics? – pat – 2016-05-16T22:16:21.690
"p" is a word? (all words list) – ev3commander – 2016-05-18T22:16:10.610