C# - 30 Seconds
A different approach from most if I read right - I don't use any hash based structures.
I tend to get no results, not sure if this a statistical anomaly, or error in my reasoning. Fixed, comparison for binary sort was flawed.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
namespace FilterFile
{
class Program
{
const int COUNT = 50000000;
static string inputFile = "data" + COUNT + ".txt";
static string outputFile = "results.txt";
static void Main(string[] args)
{
Console.WriteLine("Prepping Test");
if (args.Length > 0) inputFile = args[0];
if (args.Length > 1) outputFile = args[1];
if (!File.Exists(inputFile))
{
Console.WriteLine(inputFile);
File.WriteAllLines(inputFile,
GenerateData(COUNT)
.Select(r => string.Format("{0} {1} {2}", r.A, r.B, r.C)));
}
File.Delete("results.txt");
Console.WriteLine("Starting Test \n\n");
using (Timer.Create("Total Time"))
{
Row[] sortedA, sortedB;
//http://codegolf.stackexchange.com/questions/26643/filter-a-large-file-quickly
using (Timer.Create("Reading Data"))
FillData(out sortedA, out sortedB);
using (Timer.Create("Parallel Sort A"))
ParallelSort.QuicksortParallel(sortedA);
using (Timer.Create("Parallel Sort B"))
ParallelSort.QuicksortParallel(sortedB, (x, y) => x.B - y.B);
object rLock = new object();
List<Row> results = new List<Row>();
var comparison = Comparer<Row>.Create((B, A) => B.B - A.A);
using (Timer.Create("Compute Results"))
Parallel.ForEach(sortedA, row =>
//foreach (var row in sortedA)
{
var i = Array.BinarySearch(sortedB, row, comparison);
if (i < 0) return;
Row other;
bool solved = false;
for (var tempI = i; tempI < sortedB.Length && row.A == (other = sortedB[tempI]).B; tempI++)
{
var diff = row.C - other.C;
if (diff >= 0 && diff < 100)
{
lock (rLock) results.Add(row);
return;
}
}
for (var tempI = i - 1; tempI >= 0 && row.A == (other = sortedB[tempI]).B; tempI--)
{
var diff = row.C - other.C;
if (diff >= 0 && diff < 100)
{
lock (rLock) results.Add(row);
return;
}
}
});
using (Timer.Create("Save Results"))
{
File.WriteAllLines(outputFile, results.Select(r => r.ToString()));
}
}
}
private static void FillData(out Row[] sortedA, out Row[] sortedB)
{
var tempA = new Row[COUNT];
var tempB = tempA;//new Row[COUNT];
const int PARTITION_SIZE = 1 << 22;
ReadAndSort(tempA, tempB, PARTITION_SIZE);
sortedA = tempA;
sortedB = new Row[COUNT];
Array.Copy(sortedA, sortedB, COUNT);
/*using (Timer.Create("MergeA"))
{
int destIndex = 0;
int[][] partitions = Enumerable.Range(0, COUNT / PARTITION_SIZE + 1)
.Select(i => new[] { i * PARTITION_SIZE, Math.Min(i * PARTITION_SIZE + PARTITION_SIZE, COUNT) - 1 })
.ToArray();
for (int i = 0; i < COUNT; i++)
{
foreach (var partition in partitions)
{
while (partition[0] <= partition[1] && tempA[partition[0]].A == i)
{
sortedA[destIndex++] = tempA[partition[0]++];
}
}
}
}*/
/*//Verify Paritioning Works
var results = new List<Tuple<Row, int>> { Tuple.Create(tempA[0], 0) };
for (int i = 1; i < tempA.Length; i++)
{
var r = tempA[i];
if (r.A < tempA[i-1].A)
results.Add(Tuple.Create(r, i % PARTITION_SIZE));
}
results.ForEach(t => Console.WriteLine(t.Item1 + " " + t.Item2));*/
}
private static void ReadAndSort(Row[] tempA, Row[] tempB, int PARTITION_SIZE)
{
List<Task> tasks = new List<Task>();
using (var stream = File.OpenRead(inputFile))
{
int b;
int tempMember = 0;
int memberIndex = 0;
int elementIndex = 0;
using (Timer.Create("Read From Disk"))
while ((b = stream.ReadByte()) >= 0)
{
switch (b)
{
case (byte)'\r':
case (byte)' ':
switch (memberIndex)
{
case 0: tempA[elementIndex].A = tempMember; memberIndex = 1; break;
case 1: tempA[elementIndex].B = tempMember; memberIndex = 2; break;
case 2: tempA[elementIndex].C = tempMember; memberIndex = 0; break;
}
tempMember = 0;
break;
case (byte)'\n':
/*if (elementIndex % PARTITION_SIZE == 0 && elementIndex > 0)
{
var copiedIndex = elementIndex;
tasks.Add(Task.Run(() =>
{
var startIndex = copiedIndex - PARTITION_SIZE;
Array.Copy(tempA, startIndex, tempB, startIndex, PARTITION_SIZE);
ParallelSort.QuicksortSequentialInPlace(tempA, startIndex, copiedIndex - 1);
ParallelSort.QuicksortSequentialInPlace(tempB, startIndex, copiedIndex - 1, (x, y) => x.B - y.B);
}));
}*/
elementIndex++;
break;
default:
tempMember = tempMember * 10 + b - '0';
break;
}
}
/* tasks.Add(Task.Run(() =>
{
elementIndex--; //forget about the last \n
var startIndex = (elementIndex / PARTITION_SIZE) * PARTITION_SIZE;
Array.Copy(tempA, startIndex, tempB, startIndex, elementIndex - startIndex + 1);
ParallelSort.QuicksortParallelInPlace(tempA, startIndex, elementIndex);
ParallelSort.QuicksortSequentialInPlace(tempB, startIndex, elementIndex, (x, y) => x.B - y.B);
}));
using (Timer.Create("WaitForSortingToFinish"))
Task.WaitAll(tasks.ToArray());*/
}
}
static Random rand = new Random();
public struct Row : IComparable<Row>
{
public int A;
public int B;
public int C;
public static Row RandomRow(int count)
{
return new Row { A = rand.Next(count), B = rand.Next(count), C = rand.Next(count) };
}
public int CompareTo(Row other)
{
return A - other.A;
}
public override string ToString()
{
return string.Format("{0} {1} {2}", A, B, C);
}
}
public static Row[] GenerateData(int count)
{
var data = new Row[count];
for (int i = 0; i < count; i++)
data[i] = Row.RandomRow(count);
return data;
}
public static Row[] GenerateSplitData(int count)
{
var data = new Row[count];
for (int i = 0; i < count; i++)
data[i] = Row.RandomRow(count);
return data;
}
public class Timer : IDisposable
{
string message;
Stopwatch sw;
public static Timer Create(string message)
{
Console.WriteLine("Started: " + message);
var t = new Timer();
t.message = message;
t.sw = Stopwatch.StartNew();
return t;
}
public void Dispose()
{
Console.WriteLine("Finished: " + message + " in " + sw.ElapsedMilliseconds + "ms");
}
}
// <summary>
/// Parallel quicksort algorithm.
/// </summary>
public class ParallelSort
{
const int SEQUENTIAL_THRESHOLD = 4096;
#region Public Static Methods
/// <summary>
/// Sequential quicksort.
/// </summary>
/// <typeparam name="T"></typeparam>
/// <param name="arr"></param>
public static void QuicksortSequential<T>(T[] arr) where T : IComparable<T>
{
QuicksortSequentialInPlace(arr, 0, arr.Length - 1);
}
/// <summary>
/// Parallel quicksort
/// </summary>
/// <typeparam name="T"></typeparam>
/// <param name="arr"></param>
public static void QuicksortParallel<T>(T[] arr) where T : IComparable<T>
{
QuicksortParallelInPlace(arr, 0, arr.Length - 1);
}
#endregion
#region Private Static Methods
public static void QuicksortSequentialInPlace<T>(T[] arr, int left, int right)
where T : IComparable<T>
{
if (right > left)
{
int pivot = Partition(arr, left, right);
QuicksortSequentialInPlace(arr, left, pivot - 1);
QuicksortSequentialInPlace(arr, pivot + 1, right);
}
}
public static void QuicksortParallelInPlace<T>(T[] arr, int left, int right)
where T : IComparable<T>
{
if (right > left)
{
if (right - left < SEQUENTIAL_THRESHOLD)
QuicksortSequentialInPlace(arr, left, right);
else
{
int pivot = Partition(arr, left, right);
Parallel.Invoke(() => QuicksortParallelInPlace(arr, left, pivot - 1),
() => QuicksortParallelInPlace(arr, pivot + 1, right));
}
}
}
private static void Swap<T>(T[] arr, int i, int j)
{
T tmp = arr[i];
arr[i] = arr[j];
arr[j] = tmp;
}
private static int Partition<T>(T[] arr, int low, int high)
where T : IComparable<T>
{
// Simple partitioning implementation
int pivotPos = (high + low) / 2;
T pivot = arr[pivotPos];
Swap(arr, low, pivotPos);
int left = low;
for (int i = low + 1; i <= high; i++)
{
if (arr[i].CompareTo(pivot) < 0)
{
left++;
Swap(arr, i, left);
}
}
Swap(arr, low, left);
return left;
}
#endregion
#region Public Static Methods
/// <summary>
/// Sequential quicksort.
/// </summary>
/// <typeparam name="T"></typeparam>
/// <param name="arr"></param>
public static void QuicksortSequential<T>(T[] arr, Func<T, T, int> comparer)
{
QuicksortSequentialInPlace(arr, 0, arr.Length - 1, comparer);
}
/// <summary>
/// Parallel quicksort
/// </summary>
/// <typeparam name="T"></typeparam>
/// <param name="arr"></param>
public static void QuicksortParallel<T>(T[] arr, Func<T, T, int> comparer)
{
QuicksortParallelInPlace(arr, 0, arr.Length - 1, comparer);
}
#endregion
#region Private Static Methods
public static void QuicksortSequentialInPlace<T>(T[] arr, int left, int right, Func<T, T, int> comparer)
{
if (right > left)
{
int pivot = Partition(arr, left, right, comparer);
QuicksortSequentialInPlace(arr, left, pivot - 1, comparer);
QuicksortSequentialInPlace(arr, pivot + 1, right, comparer);
}
}
public static void QuicksortParallelInPlace<T>(T[] arr, int left, int right, Func<T, T, int> comparer)
{
if (right > left)
{
if (right - left < SEQUENTIAL_THRESHOLD)
{
QuicksortSequentialInPlace(arr, left, right, comparer);
}
else
{
int pivot = Partition(arr, left, right, comparer);
Parallel.Invoke(() => QuicksortParallelInPlace(arr, left, pivot - 1, comparer),
() => QuicksortParallelInPlace(arr, pivot + 1, right, comparer));
}
}
}
private static int Partition<T>(T[] arr, int low, int high, Func<T, T, int> comparer)
{
// Simple partitioning implementation
int pivotPos = (high + low) / 2;
T pivot = arr[pivotPos];
Swap(arr, low, pivotPos);
int left = low;
for (int i = low + 1; i <= high; i++)
{
if (comparer(arr[i], pivot) < 0)
{
left++;
Swap(arr, i, left);
}
}
Swap(arr, low, left);
return left;
}
#endregion
}
}
}
1Maybe you should try writing a challenge that isn't [fastest-code]. – Justin – 2014-05-05T20:52:15.883
1
@Quincunx http://codegolf.stackexchange.com/questions/26520/add-two-numbers-safely-in-c :)
– None – 2014-05-05T20:57:13.743Why did you strip the user and sys time for wc? These are relevant because they show how much of the time elapsed was io and how much CPU (iow, was your cache primed when you ran your time wc invocation?) – skibrianski – 2014-05-05T22:45:05.627
@skibrianski Fixed. – None – 2014-05-06T08:18:27.663
2Please define positive integer.
1 < n < 2147483647
? – durron597 – 2014-05-07T21:01:54.470Oh, or reading your sample, the numbers are bounded by the number of lines in the file? – durron597 – 2014-05-07T21:04:18.113
Does the order of the output lines matter? Also, should the lines be output as a pair or can we just show the lines that match either condition in arbitrary order? – skibrianski – 2014-05-08T01:03:34.947
@skibrianski The order of the output lines is not important. The output lines should simply be a subset of the input lines so yes you should output the lines that match either condition in arbitrary order. – None – 2014-05-08T05:51:50.890
@durron597 Yes I will only test on files that are made using my sample python code. – None – 2014-05-08T05:52:20.517
If a line has multiple matches, can it be printed multiple times? (Yes would be good for parallelizing the problem and paralyzing your computer.) – Scott Leadley – 2014-05-09T06:55:17.617
1@ScottLeadley No, unless it appeared multiple times in the input of course (which I think is very unlikely). – None – 2014-05-09T07:32:33.827
Please define what a positive integer is. Also, what does "either" mean in your question. Does it mean at least one condition, or does it mean one condition, but not both? – Alexandre Santos – 2014-05-11T07:34:04.160
@AlexandreSantos "either" means at least one condition. The easiest way to understand what a positive integer is is to see the sample python code. – None – 2014-05-11T08:00:50.330
Have you got any tool for checking the correctness of the output? – James_pic – 2014-05-11T08:50:04.270
1What graphics card do you have and how much video memory does it have? – IchBinKeinBaum – 2014-05-11T16:57:19.887
@IchBinKeinBaum VGA compatible controller: Advanced Micro Devices, Inc. [AMD/ATI] RS880 [Radeon HD 4250] – None – 2014-05-12T18:43:35.873
@James_pic I am just comparing to my own slow code and the various submissions to each other. – None – 2014-05-12T18:43:58.827
I don't get deterministic output from the python script. Must solutions work for all possible outputs? Or should there be a constant seed value? – laindir – 2014-05-12T19:25:52.560
@laindir All possible outputs. – None – 2014-05-12T19:26:18.447
My script is IO bound, so what storage does your system have and will the file be cached in memory first or read directly from disk? – NPSF3000 – 2014-05-12T23:19:47.617
1@NPSF3000 I run
sync && sudo bash -c 'echo 3 > /proc/sys/vm/drop_caches'
before each test. I have a standard hard disk drive. – None – 2014-05-13T08:18:15.093Thanks, my version should now be accpeting of cmd line arguments (inputFileName outPutFileName) – NPSF3000 – 2014-05-13T12:52:26.733