PYTHON
Below is a self-matching regex generator. You provide two lists, one contains training data that the regex should match (in addition to matching itself), the other contains training data the the regex should NOT match:
from random import choice, randrange
import re
from itertools import zip_longest, chain, islice
from operator import itemgetter
CHAR_SET = [chr(i) for i in range(128)] + [r"\\", r"\d", r"\D",
r"\w", r"\W", r"\s",
r"\S", r"?:", r"\1",
r"\2", r"\A", r"\b",
r"\B", r"\Z", r"\.",
r"\[", r"\]", r"\(",
r"\)", r"\{", r"\}",
r"\+", r"\|", r"\?",
r"\*"]
CHAR_SAMPLE = []
BREAKPOINT = re.compile(
r"""
\(.*?\)|
\[.*?\]|
\{.*?\}|
\w+(?=[\(\[\{])?|
\S+?|
\.\*\??|
\.\+\??|
\.\?\??|
\\.|
.*?
""",
re.VERBOSE)
MATCH_BRACKETS = {'(': ')', '[': ']', '{': '}'}
CLOSE_BRACKETS = {')', ']', '}'}
REGEX_SEEDER = [
r".*?",
r"(?:.*?)",
r"\w|\s",
r"(?<.*?)",
r"(?=.*?)",
r"(?!.*?)",
r"(?<=.*?)",
r"(?<!.*?)",
]
LEN_LIMIT = 100
def distribute(distribution):
global CHAR_SAMPLE
for item in CHAR_SET:
if item in distribution:
CHAR_SAMPLE.extend([item] * distribution[item])
else:
CHAR_SAMPLE.append(item)
def rand_index(seq, stop=None):
if stop is None:
stop = len(seq)
try:
return randrange(0, stop)
except ValueError:
return 0
def rand_slice(seq):
try:
start = randrange(0, len(seq))
stop = randrange(start, len(seq))
return slice(start, stop)
except ValueError:
return slice(0, 0)
#Mutation Functions
def replace(seq):
seq[rand_index(seq)] = choice(CHAR_SAMPLE)
def delete(seq):
del seq[rand_index(seq)]
def insert(seq):
seq.insert(rand_index(seq, len(seq) + 1), choice(CHAR_SAMPLE))
def duplicate(seq):
source = rand_slice(seq)
seq[source.stop: source.stop] = seq[source]
def swap(seq):
if len(seq) < 2: return
a = rand_index(seq, len(seq) - 1)
seq[a], seq[a + 1] = seq[a + 1], seq[a]
dummy = lambda seq: None
MUTATE = (
replace,
delete,
insert,
duplicate,
swap,
dummy,
dummy,
)
def repair_brackets(seq):
"""Attempts to lower the percentage of invalid regexes by
matching orphaned brackets"""
p_stack, new_seq = [], []
for item in seq:
if item in MATCH_BRACKETS:
p_stack.append(item)
elif item in CLOSE_BRACKETS:
while p_stack and MATCH_BRACKETS[p_stack[-1]] != item:
new_seq.append(MATCH_BRACKETS[p_stack[-1]])
p_stack.pop()
if not p_stack:
continue
else:
p_stack.pop()
new_seq.append(item)
while p_stack:
new_seq.append(MATCH_BRACKETS[p_stack.pop()])
return new_seq
def compress(seq):
new_seq = [seq[0]]
last_match = seq[0]
repeat = 1
for item in islice(seq, 1, len(seq)):
if item == last_match:
repeat += 1
else:
if repeat > 1:
new_seq.extend(list("{{{0}}}".format(repeat)))
new_seq.append(item)
last_match = item
repeat = 1
else:
if repeat > 1:
new_seq.extend(list("{{{0}}}".format(repeat)))
return new_seq
def mutate(seq):
"""Random in-place mutation of sequence"""
if len(seq) > LEN_LIMIT:
seq[:] = seq[:LEN_LIMIT]
c = choice(MUTATE)
c(seq)
def crossover(seqA, seqB):
"""Recombination of two sequences at optimal breakpoints
along each regex strand"""
bpA = [item.start() for item in BREAKPOINT.finditer(''.join(seqA))]
bpB = [item.start() for item in BREAKPOINT.finditer(''.join(seqA))]
slObjA = (slice(*item) for item in zip(bpA, bpA[1:]))
slObjB = (slice(*item) for item in zip(bpB, bpB[1:]))
slices = zip_longest(
(seqA[item] for item in slObjA),
(seqB[item] for item in slObjB),
fillvalue=[]
)
recombinant = (choice(item) for item in slices)
return list(chain.from_iterable(recombinant))
#Fitness testing
def match_percentage(match):
"""Calculates the percentage a text actually matched
by a regular expression"""
if match and match.endpos:
return (match.end() - match.start()) / match.endpos
else:
return 0.001
def fitness_test(seq, pos_matches, neg_matches):
"""Scoring algorithm to determine regex fitness"""
try:
self_str = ''.join(seq)
regex = re.compile(self_str)
except (re.error, IndexError):
seq[:] = repair_brackets(seq)
try:
self_str = ''.join(seq)
regex = re.compile(self_str)
except (re.error, IndexError):
return 0.001
pos_score = sum(match_percentage(regex.search(item))
for item in pos_matches) / len(pos_matches) / 3
neg_score = (1 - sum(match_percentage(regex.search(item))
for item in neg_matches) / len(neg_matches)) / 3
self_score = match_percentage(regex.search(self_str)) / 3
return pos_score + self_score + neg_score
#Population Management
def generate_pop(pos_matches, neg_matches, pop_size):
sources = (pos_matches, REGEX_SEEDER)
return [crossover(
choice(choice(sources)), choice(choice(sources))
) for i in range(pop_size)]
def glean_pop(population, cutoff, fit_test, ft_args=()):
scores = (fit_test(bug, *ft_args) for bug in population)
ranked = sorted(zip(population, scores), key=itemgetter(1), reverse=True)
maxItem = ranked[0]
new_pop = next(zip(*ranked))[:cutoff]
return maxItem, new_pop
def repopulate(population, pop_size):
cutoff = len(population)
for i in range(pop_size // cutoff):
population.extend([crossover(choice(population), choice(population))
for i in range(cutoff)])
population.extend([population[i][:] for i in range(pop_size - len(population))])
#Simulator
def simulate(pos_matches, neg_matches, pop_size=50, cutoff=10, threshold=1.0):
population = generate_pop(pos_matches, neg_matches, pop_size)
while True:
for bug in population:
mutate(bug)
#Scoring step
max_item, population = glean_pop(
population,
cutoff,
fitness_test,
(pos_matches, neg_matches)
)
#Exit condition:
max_regex, max_score = max_item
if max_score >= threshold:
return max_score, max_regex
"""
print(max_score, ''.join(max_regex))
input("next?")"""
#Repopulation Step:
population = list(population)
repopulate(population, pop_size)
6
There was a question on SO a while ago, whether there is a regex that matches valid regexes: http://stackoverflow.com/questions/172303/is-there-a-regular-expression-to-detect-a-valid-regular-expression
– Patrick Oscity – 2012-07-27T20:13:08.340@padde Very interesting! :) In the comments, they said that regex doesn't match itself, but it apparently matches all non-recursive regex. Too bad--that would likely be a winning entry (matching itself and all other entries). – Casey Kuball – 2012-07-27T22:57:39.560
5Define non-trivial. I mean, OK,
A
would be trivial, but where do you draw the line? And by "self-matching" do you mean it can only match itself, or is it allowed to match other strings too? Would.
qualify? – Mr Lister – 2012-07-28T13:58:34.640@MrLister Non-trivial would be decided by voters. Self-matching means that it matches itself, but part of it being useful/practical means that it matches other things too.
.
qualifies as a self-matching regex, but that's also pretty trivial. So isABC123Hello World
. – Casey Kuball – 2012-07-28T15:48:31.2131@padde which actually wasn't a regex because the grammar that describes regular expression is context-free. – FUZxxl – 2012-07-29T11:39:47.660
1@FUZxxl yes, that is true but one could still write a regex that matches other regexes, but doesnt care about the validity of the matched regexes. – Patrick Oscity – 2012-07-29T13:20:12.980
1@padde Well, what is an invalid regex then? An invalid regex obviously isn't a regex. So you essentially say: "yes, that is true but one could still write a regex that matches other regexes, but doesnt care if the matched regex really is a regex" (sic!) – FUZxxl – 2012-07-29T16:19:02.160
@FUZxxl, "regex" and "regular expression" parted ways many years ago. Now the relationships between them are i) an inclusion; and ii) an etymological link. – Peter Taylor – 2012-08-02T17:22:04.433
What does the winner get? – Inkbug – 2012-08-03T11:33:40.630
/.+/
Matches itself and can be used to determine if a string is empty or not. – Shmiddty – 2014-05-30T22:08:10.000