Since length is listed as a criterion, here's the golfed version at 1681 characters (could probably still be improved by 10%):
import java.io.*;import java.util.*;public class W{public static void main(String[]
a)throws Exception{int n=a.length<1?5:a[0].length(),p,q;String f,t,l;S w=new S();Scanner
s=new Scanner(new
File("sowpods"));while(s.hasNext()){f=s.next();if(f.length()==n)w.add(f);}if(a.length<1){String[]x=w.toArray(new
String[0]);Random
r=new Random();q=x.length;p=r.nextInt(q);q=r.nextInt(q-1);f=x[p];t=x[p>q?q:q+1];}else{f=a[0];t=a[1];}H<S>
A=new H(),B=new H(),C=new H();for(String W:w){A.put(W,new
S());for(p=0;p<n;p++){char[]c=W.toCharArray();c[p]='.';l=new
String(c);A.get(W).add(l);S z=B.get(l);if(z==null)B.put(l,z=new
S());z.add(W);}}for(String W:A.keySet()){C.put(W,w=new S());for(String
L:A.get(W))for(String b:B.get(L))if(b!=W)w.add(b);}N m,o,ñ;H<N> N=new H();N.put(f,m=new
N(f,t));N.put(t,o=new N(t,t));m.k=0;N[]H=new
N[3];H[0]=m;p=H[0].h;while(0<1){if(H[0]==null){if(H[1]==H[2])break;H[0]=H[1];H[1]=H[2];H[2]=null;p++;continue;}if(p>=o.k-1)break;m=H[0];H[0]=m.x();if(H[0]==m)H[0]=null;for(String
v:C.get(m.s)){ñ=N.get(v);if(ñ==null)N.put(v,ñ=new N(v,t));if(m.k+1<ñ.k){if(ñ.k<ñ.I){q=ñ.k+ñ.h-p;N
Ñ=ñ.x();if(H[q]==ñ)H[q]=Ñ==ñ?null:Ñ;}ñ.b=m;ñ.k=m.k+1;q=ñ.k+ñ.h-p;if(H[q]==null)H[q]=ñ;else{ñ.n=H[q];ñ.p=ñ.n.p;ñ.n.p=ñ.p.n=ñ;}}}}if(o.b==null)System.out.println(f+"\n"+t+"\nOY");else{String[]P=new
String[o.k+2];P[o.k+1]=o.k-1+"";m=o;for(q=m.k;q>=0;q--){P[q]=m.s;m=m.b;}for(String
W:P)System.out.println(W);}}}class N{String s;int k,h,I=(1<<30)-1;N b,p,n;N(String S,String
d){s=S;for(k=0;k<d.length();k++)if(d.charAt(k)!=S.charAt(k))h++;k=I;p=n=this;}N
x(){N r=n;n.p=p;p.n=n;n=p=this;return r;}}class S extends HashSet<String>{}class H<V>extends
HashMap<String,V>{}
The ungolfed version, which uses package names and methods and doesn't give warnings or extend classes just to alias them, is:
package com.akshor.pjt33;
import java.io.*;
import java.util.*;
// WordLadder partially golfed and with reduced dependencies
//
// Variables used in complexity analysis:
// n is the word length
// V is the number of words (vertex count of the graph)
// E is the number of edges
// hash is the cost of a hash insert / lookup - I will assume it's constant, but without completely brushing it under the carpet
public class WordLadder2
{
private Map<String, Set<String>> wordsToWords = new HashMap<String, Set<String>>();
// Initialisation cost: O(V * n * (n + hash) + E * hash)
private WordLadder2(Set<String> words)
{
Map<String, Set<String>> wordsToLinks = new HashMap<String, Set<String>>();
Map<String, Set<String>> linksToWords = new HashMap<String, Set<String>>();
// Cost: O(Vn * (n + hash))
for (String word : words)
{
// Cost: O(n*(n + hash))
for (int i = 0; i < word.length(); i++)
{
// Cost: O(n + hash)
char[] ch = word.toCharArray();
ch[i] = '.';
String link = new String(ch).intern();
add(wordsToLinks, word, link);
add(linksToWords, link, word);
}
}
// Cost: O(V * n * hash + E * hash)
for (Map.Entry<String, Set<String>> from : wordsToLinks.entrySet()) {
String src = from.getKey();
wordsToWords.put(src, new HashSet<String>());
for (String link : from.getValue()) {
Set<String> to = linksToWords.get(link);
for (String snk : to) {
// Note: equality test is safe here. Cost is O(hash)
if (snk != src) add(wordsToWords, src, snk);
}
}
}
}
public static void main(String[] args) throws IOException
{
// Cost: O(filelength + num_words * hash)
Map<Integer, Set<String>> wordsByLength = new HashMap<Integer, Set<String>>();
BufferedReader br = new BufferedReader(new FileReader("sowpods"), 8192);
String line;
while ((line = br.readLine()) != null) add(wordsByLength, line.length(), line);
if (args.length == 2) {
String from = args[0].toUpperCase();
String to = args[1].toUpperCase();
new WordLadder2(wordsByLength.get(from.length())).findPath(from, to);
}
else {
// 5-letter words are the most interesting.
String[] _5 = wordsByLength.get(5).toArray(new String[0]);
Random rnd = new Random();
int f = rnd.nextInt(_5.length), g = rnd.nextInt(_5.length - 1);
if (g >= f) g++;
new WordLadder2(wordsByLength.get(5)).findPath(_5[f], _5[g]);
}
}
// O(E * hash)
private void findPath(String start, String dest) {
Node startNode = new Node(start, dest);
startNode.cost = 0; startNode.backpointer = startNode;
Node endNode = new Node(dest, dest);
// Node lookup
Map<String, Node> nodes = new HashMap<String, Node>();
nodes.put(start, startNode);
nodes.put(dest, endNode);
// Heap
Node[] heap = new Node[3];
heap[0] = startNode;
int base = heap[0].heuristic;
// O(E * hash)
while (true) {
if (heap[0] == null) {
if (heap[1] == heap[2]) break;
heap[0] = heap[1]; heap[1] = heap[2]; heap[2] = null; base++;
continue;
}
// If the lowest cost isn't at least 1 less than the current cost for the destination,
// it can't improve the best path to the destination.
if (base >= endNode.cost - 1) break;
// Get the cheapest node from the heap.
Node v0 = heap[0];
heap[0] = v0.remove();
if (heap[0] == v0) heap[0] = null;
// Relax the edges from v0.
int g_v0 = v0.cost;
// O(hash * #neighbours)
for (String v1Str : wordsToWords.get(v0.key))
{
Node v1 = nodes.get(v1Str);
if (v1 == null) {
v1 = new Node(v1Str, dest);
nodes.put(v1Str, v1);
}
// If it's an improvement, use it.
if (g_v0 + 1 < v1.cost)
{
// Update the heap.
if (v1.cost < Node.INFINITY)
{
int bucket = v1.cost + v1.heuristic - base;
Node t = v1.remove();
if (heap[bucket] == v1) heap[bucket] = t == v1 ? null : t;
}
// Next update the backpointer and the costs map.
v1.backpointer = v0;
v1.cost = g_v0 + 1;
int bucket = v1.cost + v1.heuristic - base;
if (heap[bucket] == null) {
heap[bucket] = v1;
}
else {
v1.next = heap[bucket];
v1.prev = v1.next.prev;
v1.next.prev = v1.prev.next = v1;
}
}
}
}
if (endNode.backpointer == null) {
System.out.println(start);
System.out.println(dest);
System.out.println("OY");
}
else {
String[] path = new String[endNode.cost + 1];
Node t = endNode;
for (int i = t.cost; i >= 0; i--) {
path[i] = t.key;
t = t.backpointer;
}
for (String str : path) System.out.println(str);
System.out.println(path.length - 2);
}
}
private static <K, V> void add(Map<K, Set<V>> map, K key, V value) {
Set<V> vals = map.get(key);
if (vals == null) map.put(key, vals = new HashSet<V>());
vals.add(value);
}
private static class Node
{
public static int INFINITY = Integer.MAX_VALUE >> 1;
public String key;
public int cost;
public int heuristic;
public Node backpointer;
public Node prev = this;
public Node next = this;
public Node(String key, String dest) {
this.key = key;
cost = INFINITY;
for (int i = 0; i < dest.length(); i++) if (dest.charAt(i) != key.charAt(i)) heuristic++;
}
public Node remove() {
Node rv = next;
next.prev = prev;
prev.next = next;
next = prev = this;
return rv;
}
}
}
As you can see, the running cost analysis is O(filelength + num_words * hash + V * n * (n + hash) + E * hash). If you'll accept my assumption that a hash table insertion/lookup is constant time, that's O(filelength + V n^2 + E). The particular statistics of the graphs in SOWPODS mean that O(V n^2) really dominates O(E) for most n.
Sample outputs:
IDOLA, IDOLS, IDYLS, ODYLS, ODALS, OVALS, OVELS, OVENS, EVENS, ETENS, STENS, SKENS, SKINS, SPINS, SPINE, 13
WICCA, PROSY, OY
BRINY, BRINS, TRINS, TAINS, TARNS, YARNS, YAWNS, YAWPS, YAPPS, 7
GALES, GASES, GASTS, GESTS, GESTE, GESSE, DESSE, 5
SURES, DURES, DUNES, DINES, DINGS, DINGY, 4
LICHT, LIGHT, BIGHT, BIGOT, BIGOS, BIROS, GIROS, GIRNS, GURNS, GUANS, GUANA, RUANA, 10
SARGE, SERGE, SERRE, SERRS, SEERS, DEERS, DYERS, OYERS, OVERS, OVELS, OVALS, ODALS, ODYLS, IDYLS, 12
KEIRS, SEIRS, SEERS, BEERS, BRERS, BRERE, BREME, CREME, CREPE, 7
This is one of the 6 pairs with the longest shortest path:
GAINEST, FAINEST, FAIREST, SAIREST, SAIDEST, SADDEST, MADDEST, MIDDEST, MILDEST, WILDEST, WILIEST, WALIEST, WANIEST, CANIEST, CANTEST, CONTEST, CONFEST, CONFESS, CONFERS, CONKERS, COOKERS, COOPERS, COPPERS, POPPERS, POPPETS, POPPITS, POPPIES, POPSIES, MOPSIES, MOUSIES, MOUSSES, POUSSES, PLUSSES, PLISSES, PRISSES, PRESSES, PREASES, UREASES, UNEASES, UNCASES, UNCASED, UNBASED, UNBATED, UNMATED, UNMETED, UNMEWED, ENMEWED, ENDEWED, INDEWED, INDEXED, INDEXES, INDENES, INDENTS, INCENTS, INCESTS, INFESTS, INFECTS, INJECTS, 56
And one of the worst-case soluble 8-letter pairs:
ENROBING, UNROBING, UNROPING, UNCOPING, UNCAPING, UNCAGING, ENCAGING, ENRAGING, ENRACING, ENLACING, UNLACING, UNLAYING, UPLAYING, SPLAYING, SPRAYING, STRAYING, STROYING, STROKING, STOOKING, STOOPING, STOMPING, STUMPING, SLUMPING, CLUMPING, CRUMPING, CRIMPING, CRISPING, CRISPINS, CRISPENS, CRISPERS, CRIMPERS, CRAMPERS, CLAMPERS, CLASPERS, CLASHERS, SLASHERS, SLATHERS, SLITHERS, SMITHERS, SMOTHERS, SOOTHERS, SOUTHERS, MOUTHERS, MOUCHERS, COUCHERS, COACHERS, POACHERS, POTCHERS, PUTCHERS, PUNCHERS, LUNCHERS, LYNCHERS, LYNCHETS, LINCHETS, 52
Now that I think I've got all the requirements of the question out of the way, on to my discussion.
For a computer scientist, the question obviously reduces to shortest path in a graph G whose vertices are words and whose edges connect words differing in one letter. Generating the graph efficiently isn't trivial - I actually have an idea I need to revisit to reduce the complexity to O(V n hash + E). The way I do it involves creating a graph which inserts extra vertices (corresponding to words with one wildcard character) and is homeomorphic to the graph in question. I did consider using that graph rather than reducing to G - and I suppose that from a golfing point of view I should have done - on the basis that a wildcard node with more than 3 edges reduces the number of edges in the graph, and the standard worst case running time of shortest path algorithms is O(V heap-op + E).
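For illustration only, a search run directly on that word-plus-wildcard graph might look like the following BFS. This is a hypothetical sketch, not the code I submitted; linksOf and wordsOf are adjacency maps built exactly like wordsToLinks and linksToWords above, and the point is that the word-to-word edge set E is never materialised.

import java.util.*;

class WildcardBfs {
    // BFS over the bipartite word <-> wildcard graph: each word expands to its n
    // patterns (e.g. ".OUSE"), each pattern expands back to the words matching it.
    static List<String> path(String start, String dest,
                             Map<String, Set<String>> linksOf,   // word    -> patterns
                             Map<String, Set<String>> wordsOf) { // pattern -> words
        Map<String, String> parent = new HashMap<>();
        parent.put(start, start);
        Deque<String> queue = new ArrayDeque<>();
        queue.add(start);
        while (!queue.isEmpty() && !parent.containsKey(dest)) {
            String word = queue.poll();
            for (String link : linksOf.get(word))
                for (String next : wordsOf.get(link))
                    if (!parent.containsKey(next)) {
                        parent.put(next, word);
                        queue.add(next);
                    }
        }
        if (!parent.containsKey(dest)) return null;   // no ladder exists
        LinkedList<String> path = new LinkedList<>();
        for (String w = dest; !w.equals(start); w = parent.get(w)) path.addFirst(w);
        path.addFirst(start);
        return path;
    }
}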
However, the first thing I did was to run some analyses of the graphs G for different word lengths, and I discovered that they're extremely sparse for words of 5 or more letters. The 5-letter graph has 12478 vertices and 40759 edges; adding link nodes makes the graph worse. By the time you're up to 8 letters there are fewer edges than nodes, and 3/7 of the words are "aloof". So I rejected that optimisation idea as not really helpful.
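If you want to reproduce those vertex/edge counts yourself, a throwaway helper along these lines will do (hypothetical, not part of the submission); feed it an adjacency map shaped like wordsToWords above.

import java.util.*;

class GraphStats {
    // Counts vertices, edges and isolated ("aloof") words of G from an adjacency
    // map such as wordsToWords. Each undirected edge appears twice in the map,
    // hence the division by 2.
    static void print(Map<String, Set<String>> adjacency) {
        long halfEdges = 0;
        int aloof = 0;
        for (Set<String> neighbours : adjacency.values()) {
            halfEdges += neighbours.size();
            if (neighbours.isEmpty()) aloof++;
        }
        System.out.println(adjacency.size() + " vertices, " + (halfEdges / 2)
                + " edges, " + aloof + " isolated words");
    }
}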
The idea which did prove helpful was to examine the heap. I can honestly say that I've implemented some moderately exotic heaps in the past, but none as exotic as this. I use A-star (since C provides no benefit given the heap I'm using) with the obvious heuristic of the number of letters which differ from the target, and a bit of analysis shows that at any time there are no more than 3 different priorities in the heap. When I pop a node whose priority is (cost + heuristic) and look at its neighbours, there are three cases to consider: 1) the neighbour's cost is cost+1 and its heuristic is heuristic-1 (because the letter it changes becomes "correct"); 2) cost+1 and heuristic+0 (because the letter it changes goes from "wrong" to "still wrong"); 3) cost+1 and heuristic+1 (because the letter it changes goes from "correct" to "wrong"). So if I relax the neighbour I'm going to insert it at the same priority, priority+1, or priority+2. As a result I can use a 3-element array of linked lists for the heap.
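To make that concrete, here is a minimal standalone sketch of the bucket idea. It is hypothetical code, not part of the submission: the real structure is the intrusive doubly-linked list embedded in Node above, which additionally lets a node be unlinked in O(1) when its cost improves, something this sketch omits.

import java.util.*;

// Hypothetical sketch: a queue in which at most three priorities, base..base+2,
// are ever live at once, so three plain FIFO buckets suffice.
class ThreeBucketQueue<T> {
    private final List<Deque<T>> buckets = new ArrayList<>();
    private int base;                              // priority of buckets.get(0)

    ThreeBucketQueue() {
        for (int i = 0; i < 3; i++) buckets.add(new ArrayDeque<T>());
    }

    void push(int priority, T item) {
        // By the argument above, priority always lies in [base, base + 2].
        buckets.get(priority - base).add(item);
    }

    T pop() {                                      // a lowest-priority item, or null
        for (int i = 0; i < 3; i++) {
            if (!buckets.get(i).isEmpty()) {
                // Cheaper buckets are empty: rotate them to the back and advance base.
                while (i-- > 0) { buckets.add(buckets.remove(0)); base++; }
                return buckets.get(0).poll();
            }
        }
        return null;
    }
}

In findPath above, the same idea appears as the three-element heap array together with the base variable, with the prev/next links in Node standing in for the deques.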
I should add a note about my assumption that hash lookups are constant. Very well, you may say, but what about hash computations? The answer is that I'm amortising them away: java.lang.String caches its hashCode(), so the total time spent computing hashes is O(V n^2) (in generating the graph).
There is another change which affects the complexity, but the question of whether it is an optimisation or not depends on your assumptions about statistics. (IMO putting "the best Big O solution" as a criterion is a mistake because there isn't a best complexity, for a simple reason: there isn't a single variable). This change affects the graph generation step. In the code above, it's:
Map<String, Set<String>> wordsToLinks = new HashMap<String, Set<String>>();
Map<String, Set<String>> linksToWords = new HashMap<String, Set<String>>();
// Cost: O(Vn * (n + hash))
for (String word : words)
{
// Cost: O(n*(n + hash))
for (int i = 0; i < word.length(); i++)
{
// Cost: O(n + hash)
char[] ch = word.toCharArray();
ch[i] = '.';
String link = new String(ch).intern();
add(wordsToLinks, word, link);
add(linksToWords, link, word);
}
}
// Cost: O(V * n * hash + E * hash)
for (Map.Entry<String, Set<String>> from : wordsToLinks.entrySet()) {
String src = from.getKey();
wordsToWords.put(src, new HashSet<String>());
for (String link : from.getValue()) {
Set<String> to = linksToWords.get(link);
for (String snk : to) {
// Note: equality test is safe here. Cost is O(hash)
if (snk != src) add(wordsToWords, src, snk);
}
}
}
That's O(V * n * (n + hash) + E * hash). But the O(V * n^2) part comes from generating a new n-character string for each link and then computing its hashcode. This can be avoided with a helper class:
private static class Link
{
private String str;
private int hash;
private int missingIdx;
public Link(String str, int hash, int missingIdx) {
this.str = str;
this.hash = hash;
this.missingIdx = missingIdx;
}
@Override
public int hashCode() { return hash; }
@Override
public boolean equals(Object obj) {
Link l = (Link)obj; // Unsafe, but I know the contexts where I'm using this class...
if (this == l) return true; // Essential
if (hash != l.hash || missingIdx != l.missingIdx) return false;
for (int i = 0; i < str.length(); i++) {
if (i != missingIdx && str.charAt(i) != l.str.charAt(i)) return false;
}
return true;
}
}
Then the first half of the graph generation becomes
Map<String, Set<Link>> wordsToLinks = new HashMap<String, Set<Link>>();
Map<Link, Set<String>> linksToWords = new HashMap<Link, Set<String>>();
// Cost: O(V * n * hash)
for (String word : words)
{
// apidoc: The hash code for a String object is computed as
// s[0]*31^(n-1) + s[1]*31^(n-2) + ... + s[n-1]
// Cost: O(n * hash)
int hashCode = word.hashCode();
int pow = 1;
for (int j = word.length() - 1; j >= 0; j--) {
Link link = new Link(word, hashCode - word.charAt(j) * pow, j);
add(wordsToLinks, word, link);
add(linksToWords, link, word);
pow *= 31;
}
}
By using the structure of the hashcode we can generate the links in O(V * n). However, this has a knock-on effect. Inherent in my assumption that hash lookups are constant time is an assumption that comparing objects for equality is cheap. However, the equality test of Link is O(n) in the worst case. The worst case is when we have a hash collision between two equal links generated from different words - i.e. it occurs O(E) times in the second half of the graph generation. Other than that, except in the unlikely event of a hash collision between non-equal links, we're good. So we've traded in O(V * n^2) for O(E * n * hash). See my previous point about statistics.
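As a quick sanity check of the incremental hash trick, two Links built from neighbouring words wildcarded at the same position should compare equal and share a hash code. This is a hypothetical throwaway main, placed in the same class as Link so it can reach the constructor:

public static void main(String[] args) {
    // "CATS" and "BATS" wildcarded at index 0 both stand for the pattern ".ATS".
    int pow = 31 * 31 * 31;                               // 31^(n-1) for index 0, n = 4
    Link a = new Link("CATS", "CATS".hashCode() - 'C' * pow, 0);
    Link b = new Link("BATS", "BATS".hashCode() - 'B' * pow, 0);
    System.out.println(a.equals(b) && a.hashCode() == b.hashCode());   // prints true
}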
This reduces itself to the Six Degrees of Wikipedia problem. While a heuristic can be developed based upon the Levenshtein distance of the source and target, the most correct approach for single source, single destination appears to be a breadth-first search based upon the connectivity of all words in, for example, the Oxford English Dictionary. Extra credit: because the OED is small, run Floyd-Warshall to create a lookup table of all solutions when caching is desired. :) – MrGomez – 2012-05-16T19:37:58.813

Do the intermediate words have to be the same length as the start and end word? – Peter Taylor – 2011-05-05T18:30:20.480

@PeterTaylor, yes - I'll edit that clarification in. – Rebecca Chernoff – 2011-05-05T18:32:23.800

I would have guessed that if only one letter can be changed per step that that is implied ;) – Joey – 2011-05-05T18:36:24.373

@Joey, often puzzles of this kind allow the change to be a deletion or insertion. – Peter Taylor – 2011-05-05T18:41:28.780

@Rebecca, does your dictionary not have GORSE? HOUSE HORSE GORSE GORGE is optimal. – Peter Taylor – 2011-05-05T18:43:14.317

Peter: I guess optimality will be judged from the word list. – Joey – 2011-05-05T18:45:55.443

@PeterTaylor, that's what I get for copying from a random website I found a few examples from. |: – Rebecca Chernoff – 2011-05-05T18:47:39.837

It seems strange to me that performing 3 operations to get from HOUSE to GORGE is reported as 2. I realize there are 2 intermediate words, so it does make sense, but the number of operations would be more intuitive. – Matthew Read – 2011-05-05T19:05:14.983

@Rebecca, can we assume that the alphabet is A-Z and that the longest words required are no longer than 13 letters? (I don't think it makes much difference to the complexity, which will be dominated by the number of edges in the graph in the worst case, but it allows some convenient hackery.) – Peter Taylor – 2011-05-05T19:05:23.597

@Peter, according to the SOWPODS Wikipedia page there are ~15k words longer than 13 letters. – gnibbler – 2011-05-06T00:53:39.097

I don't mean to be a know-it-all, but the puzzle actually has a name: it was invented by Lewis Carroll. http://en.wikipedia.org/wiki/Word_ladder – st0le – 2011-05-06T04:11:10.093

You have an undecidable goal in the question: "The fastest/best Big O solution producing the shortest interim steps after one week will win." Since you can't guarantee that the fastest solution is also the one which uses the fewest steps, you should state a preference for the case where one solution uses fewer steps but reaches the goal later. – user unknown – 2011-05-06T04:24:32.313

I just want to confirm: BAT and CAT will have zero steps, right? – st0le – 2011-05-06T05:24:23.347

@gnibbler, there are longer words but they're boring. There are 9116 14-letter words, so 41546170 pairs of 14-letter words, and only 2258 of those pairs have a solution. If you pick a random pair of 7-letter words the chances of a solution are about 1 in 7. With 8-letter words it's about 1 in 240, and then it gets worse. So I don't think it unreasonable to ask whether we can neglect words longer than a threshold. – Peter Taylor – 2011-05-06T18:56:14.183

If anyone wants a nasty test case with SOWPODS, try ENROBING LINCHETS. – Peter Taylor – 2011-05-06T22:27:58.853

@userunknown, using the shortest path possible is a basic part of the game. – Rebecca Chernoff – 2011-05-07T23:46:43.763

@Rebecca: I see - shortest path is mandatory, to be a solution at all. Thanks. – user unknown – 2011-05-08T00:28:01.557