1 /*
2 This module implements the Mildew RegExp class. See
3 https://pillager86.github.io/dmildew/RegExp.html
4 
5 ────────────────────────────────────────────────────────────────────────────────
6 
7 Copyright (C) 2021 pillager86.rf.gd
8 
9 This program is free software: you can redistribute it and/or modify it under 
10 the terms of the GNU General Public License as published by the Free Software 
11 Foundation, either version 3 of the License, or (at your option) any later 
12 version.
13 
14 This program is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
16 PARTICULAR PURPOSE.  See the GNU General Public License for more details.
17 
18 You should have received a copy of the GNU General Public License along with 
19 this program.  If not, see <https://www.gnu.org/licenses/>.
20 */
21 module mildew.stdlib.regexp;
22 
23 static import std.regex;
24 
25 import mildew.environment;
26 import mildew.interpreter;
27 import mildew.types;
28 
/**
 * Class that encapsulates regular expressions. The D struct Regex cannot be directly stored in a
 * ScriptObject, so this class wraps it together with the source pattern, the flag string, and the
 * scanning state used by exec and test.
 */
class ScriptRegExp
{
public:
    /**
     * Compiles a regular expression.
     * Params:
     *  pattern = the regular expression source; forwarded unchanged to std.regex.regex
     *  flags = the flag characters; forwarded unchanged to std.regex.regex and stored sorted by
     *          character value so that the flags property has a stable order
     * Throws: whatever std.regex.regex throws for input it rejects
     */
    this(in string pattern, in string flags="")
    {
        import std.algorithm.sorting : sort;
        import std.string : representation;

        _regex = std.regex.regex(pattern, flags);

        _source = pattern;

        // char[] is not a random access range for std.algorithm, so the code units are sorted
        // through their ubyte[] representation. An empty flag string is handled without a special case.
        char[] sortedFlags = flags.dup;
        sort(sortedFlags.representation);
        _flags = sortedFlags.idup;
    }

    /// flags property: the flag characters given to the constructor, sorted
    string flags() const { return _flags; }

    /// last index property: offset into the current exec subject where the next global exec resumes
    size_t lastIndex() const { return _lastIndex; }
    /// last index property (setter); returns the value that was stored
    size_t lastIndex(size_t li)
    {
        return _lastIndex = li;
    }

    /// source property: the pattern text given to the constructor
    string source() const { return _source; }

    /// whether or not 's' flag was used
    bool dotAll() const
    {
        return hasFlag('s');
    }

    /// whether or not 'g' flag was used
    bool global() const
    {
        return hasFlag('g');
    }

    /// whether or not 'i' flag was used
    bool ignoreCase() const
    {
        return hasFlag('i');
    }

    /// whether or not 'm' flag was used
    bool multiline() const
    {
        return hasFlag('m');
    }

    /**
     * Collects the matched text of every match that std.regex.match produces for str.
     * Returns: the hits in order; an empty array when there is no match
     */
    auto match(string str)
    {
        string[] result;
        foreach(mat ; std.regex.match(str, _regex))
            result ~= mat.hit;
        return result;
    }

    /// matchAll - returns the std.regex.matchAll range over str; the script side wraps it in an iterator
    auto matchAll(string str)
    {
        return std.regex.matchAll(str, _regex);
    }

    /// replace - replaces every match when the 'g' flag is set, otherwise only the first one
    auto replace(string str, string fmt)
    {
        if(global)
            return std.regex.replaceAll(str, _regex, fmt);
        else
            return std.regex.replaceFirst(str, _regex, fmt);
    }

    /// replace only the first occurrence.
    auto replaceFirst(string str, string fmt)
    {
        string r = std.regex.replaceFirst(str, _regex, fmt);
        return r;
    }

    /**
     * Finds the offset of the first match in str.
     * Returns: the number of code units before the first match, or -1 when nothing matches.
     *  (Previously a failed search returned str.length, which callers could not tell apart from
     *  a position.)
     */
    ptrdiff_t search(string str)
    {
        auto m = std.regex.matchFirst(str, _regex);
        if(m.empty)
            return -1;
        return cast(ptrdiff_t)m.pre.length;
    }

    /// split - splits str around every match of the pattern
    auto split(string str)
    {
        return std.regex.split(str, _regex);
    }

    /**
     * Executes the regular expression against str.
     *
     * With the 'g' flag the object is stateful: repeated calls with the same string resume at
     * lastIndex, and lastIndex is moved to the end of each match. When no further match exists
     * lastIndex is reset to 0 so the next call starts over. Without the 'g' flag every call
     * searches from the start and lastIndex is not advanced. Passing a string that differs from
     * the previous one always resets lastIndex to 0 first.
     *
     * Params:
     *  str = the subject string
     * Returns: the full match followed by each capture group, or an empty array when there is no
     *  match. An empty subject, or a resume position at or past the end of the subject, yields an
     *  empty array without attempting a match.
     */
    string[] exec(string str)
    {
        immutable stateful = global;

        // a new subject restarts the scan
        if(str != _currentExec)
        {
            _currentExec = str;
            _lastIndex = 0;
        }

        immutable resumeAt = stateful ? _lastIndex : 0;
        if(resumeAt >= str.length)
        {
            // also guards the slice below against a lastIndex assigned past the end
            if(stateful)
                _lastIndex = 0;
            return [];
        }

        // the search runs on the slice that starts at the resume position
        auto mat = std.regex.matchFirst(str[resumeAt .. $], _regex);
        if(mat.empty)
        {
            if(stateful)
                _lastIndex = 0;
            return [];
        }

        // the end of the match within str is the resume position plus the skipped prefix plus
        // the hit itself; the capture groups lie inside the hit and must not be counted again
        if(stateful)
            _lastIndex = resumeAt + mat.pre.length + mat.hit.length;

        string[] result;
        foreach(value ; mat)
            result ~= value;
        return result;
    }

    /// test - true when exec finds a match in str; shares exec's scanning state
    bool test(string str)
    {
        return exec(str).length > 0;
    }

    /// get the string representation in the form /source/flags
    override string toString() const
    {
        return "/" ~ _source ~ "/" ~ _flags;
    }

private:
    /// true when flag is one of the stored flag characters
    bool hasFlag(char flag) const
    {
        foreach(ch ; _flags)
            if(ch == flag) return true;
        return false;
    }

    string _currentExec; // subject of the most recent exec; a different subject resets _lastIndex
    size_t _lastIndex;

    string _source; // keep track of source
    string _flags; // keep track of flags
    std.regex.Regex!char _regex;
}
207 
/**
 * Registers the RegExp constructor as a global of the given interpreter. Calling this is optional
 * because regex literals are a first class language feature. Documentation for this library can
 * be found at https://pillager86.github.io/dmildew/RegExp.html
 * Params:
 *  interpreter = the interpreter that receives the "RegExp" global
 */
void initializeRegExpLibrary(Interpreter interpreter)
{
    ScriptAny regExpCtor = new ScriptFunction("RegExp", &native_RegExp_ctor, true);

    // link constructor and prototype in both directions
    auto sharedProto = getRegExpProto();
    regExpCtor["prototype"] = sharedProto;
    regExpCtor["prototype"]["constructor"] = regExpCtor;

    interpreter.forceSetGlobal("RegExp", regExpCtor, false);
}
221 
/**
 * Get the RegExp prototype. This is public because the VM needs it.
 * The prototype object is built on the first call and the same object is returned afterwards.
 */
ScriptObject getRegExpProto()
{
    if(_regExpProto !is null)
        return _regExpProto;

    // every native in this module shares one signature
    alias NativeFn = typeof(&native_RegExp_dotAll);

    // builds the ScriptFunction for a prototype member, named "RegExp.prototype.<member>"
    static ScriptFunction wrap(string member, NativeFn impl)
    {
        return new ScriptFunction("RegExp.prototype." ~ member, impl);
    }

    _regExpProto = new ScriptObject("RegExp", null);
    auto proto = _regExpProto;

    // accessor properties; lastIndex uses one native for both reading and writing
    proto.addGetterProperty("flags", wrap("flags", &native_RegExp_p_flags));
    proto.addGetterProperty("lastIndex", wrap("lastIndex", &native_RegExp_p_lastIndex));
    proto.addSetterProperty("lastIndex", wrap("lastIndex", &native_RegExp_p_lastIndex));
    proto.addGetterProperty("source", wrap("source", &native_RegExp_p_source));

    // methods
    proto["dotAll"] = wrap("dotAll", &native_RegExp_dotAll);
    proto["global"] = wrap("global", &native_RegExp_global);
    proto["ignoreCase"] = wrap("ignoreCase", &native_RegExp_ignoreCase);
    proto["multiline"] = wrap("multiline", &native_RegExp_multiline);
    proto["match"] = wrap("match", &native_RegExp_match);
    proto["matchAll"] = wrap("matchAll", &native_RegExp_matchAll);
    proto["replace"] = wrap("replace", &native_RegExp_replace);
    proto["search"] = wrap("search", &native_RegExp_search);
    proto["split"] = wrap("split", &native_RegExp_split);
    proto["exec"] = wrap("exec", &native_RegExp_exec);
    proto["test"] = wrap("test", &native_RegExp_test);

    return proto;
}
251 
// Cached RegExp prototype; null until getRegExpProto builds it on its first call.
private ScriptObject _regExpProto;
253 
/// Native RegExp constructor. args[0] is the pattern and the optional args[1] the flags; the
/// compiled ScriptRegExp is stored as the native object of thisObj. A pattern rejected with a
/// std.regex.RegexException is reported as a script exception carrying the exception message.
private ScriptAny native_RegExp_ctor(Environment env, ScriptAny* thisObj, ScriptAny[] args, ref NativeFunctionError nfe)
{
    if(!thisObj.isObject)
        return ScriptAny.UNDEFINED;
    auto target = thisObj.toValue!ScriptObject;

    if(args.length < 1)
    {
        nfe = NativeFunctionError.WRONG_NUMBER_OF_ARGS;
        return ScriptAny.UNDEFINED;
    }

    auto patternText = args[0].toString();
    auto flagText = "";
    if(args.length > 1)
        flagText = args[1].toString();

    try
    {
        auto compiled = new ScriptRegExp(patternText, flagText);
        target.nativeObject = compiled;
    }
    catch(std.regex.RegexException rex)
    {
        // hand the compile error message back to the script as the exception value
        nfe = NativeFunctionError.RETURN_VALUE_IS_EXCEPTION;
        return ScriptAny(rex.msg);
    }
    return ScriptAny.UNDEFINED;
}
277 
/// Getter for the flags property. Reports WRONG_TYPE_OF_ARG when thisObj holds no ScriptRegExp.
private ScriptAny native_RegExp_p_flags(Environment env, ScriptAny* thisObj,
                                        ScriptAny[] args, ref NativeFunctionError nfe)
{
    auto re = thisObj.toNativeObject!ScriptRegExp;
    if(re !is null)
        return ScriptAny(re.flags);

    nfe = NativeFunctionError.WRONG_TYPE_OF_ARG;
    return ScriptAny.UNDEFINED;
}
289 
/// Getter and setter for the lastIndex property. Without arguments the current value is returned;
/// otherwise args[0] is stored as the new value and returned. Reports WRONG_TYPE_OF_ARG when
/// thisObj holds no ScriptRegExp.
private ScriptAny native_RegExp_p_lastIndex(Environment env, ScriptAny* thisObj,
                                            ScriptAny[] args, ref NativeFunctionError nfe)
{
    auto re = thisObj.toNativeObject!ScriptRegExp;
    if(re is null)
    {
        nfe = NativeFunctionError.WRONG_TYPE_OF_ARG;
        return ScriptAny.UNDEFINED;
    }

    // read access
    if(args.length == 0)
        return ScriptAny(re.lastIndex);

    // write access
    immutable requested = args[0].toValue!size_t;
    return ScriptAny(re.lastIndex(requested));
}
304 
/// Getter for the source property. Reports WRONG_TYPE_OF_ARG when thisObj holds no ScriptRegExp.
private ScriptAny native_RegExp_p_source(Environment env, ScriptAny* thisObj,
                                         ScriptAny[] args, ref NativeFunctionError nfe)
{
    auto re = thisObj.toNativeObject!ScriptRegExp;
    if(re !is null)
        return ScriptAny(re.source);

    nfe = NativeFunctionError.WRONG_TYPE_OF_ARG;
    return ScriptAny.UNDEFINED;
}
316 
/// Native RegExp.prototype.dotAll: whether the 's' flag was used. Reports WRONG_TYPE_OF_ARG when
/// thisObj holds no ScriptRegExp.
private ScriptAny native_RegExp_dotAll(Environment env, ScriptAny* thisObj,
                                       ScriptAny[] args, ref NativeFunctionError nfe)
{
    auto re = thisObj.toNativeObject!ScriptRegExp;
    if(re !is null)
        return ScriptAny(re.dotAll());

    nfe = NativeFunctionError.WRONG_TYPE_OF_ARG;
    return ScriptAny.UNDEFINED;
}
328 
/// Native RegExp.prototype.global: whether the 'g' flag was used. Reports WRONG_TYPE_OF_ARG when
/// thisObj holds no ScriptRegExp.
private ScriptAny native_RegExp_global(Environment env, ScriptAny* thisObj,
                                       ScriptAny[] args, ref NativeFunctionError nfe)
{
    auto re = thisObj.toNativeObject!ScriptRegExp;
    if(re !is null)
        return ScriptAny(re.global());

    nfe = NativeFunctionError.WRONG_TYPE_OF_ARG;
    return ScriptAny.UNDEFINED;
}
340 
/// Native RegExp.prototype.ignoreCase: whether the 'i' flag was used. Reports WRONG_TYPE_OF_ARG
/// when thisObj holds no ScriptRegExp.
private ScriptAny native_RegExp_ignoreCase(Environment env, ScriptAny* thisObj,
                                       ScriptAny[] args, ref NativeFunctionError nfe)
{
    auto re = thisObj.toNativeObject!ScriptRegExp;
    if(re !is null)
        return ScriptAny(re.ignoreCase());

    nfe = NativeFunctionError.WRONG_TYPE_OF_ARG;
    return ScriptAny.UNDEFINED;
}
352 
/// Native RegExp.prototype.multiline: whether the 'm' flag was used. Reports WRONG_TYPE_OF_ARG
/// when thisObj holds no ScriptRegExp.
private ScriptAny native_RegExp_multiline(Environment env, ScriptAny* thisObj,
                                       ScriptAny[] args, ref NativeFunctionError nfe)
{
    auto re = thisObj.toNativeObject!ScriptRegExp;
    if(re !is null)
        return ScriptAny(re.multiline());

    nfe = NativeFunctionError.WRONG_TYPE_OF_ARG;
    return ScriptAny.UNDEFINED;
}
364 
/// Native RegExp.prototype.match: converts args[0] to a string and returns the hits that
/// ScriptRegExp.match collects for it. Reports WRONG_TYPE_OF_ARG when thisObj holds no
/// ScriptRegExp and WRONG_NUMBER_OF_ARGS when no argument is given.
private ScriptAny native_RegExp_match(Environment env, ScriptAny* thisObj,
                                       ScriptAny[] args, ref NativeFunctionError nfe)
{
    auto re = thisObj.toNativeObject!ScriptRegExp;
    if(re is null)
        nfe = NativeFunctionError.WRONG_TYPE_OF_ARG;
    else if(args.length < 1)
        nfe = NativeFunctionError.WRONG_NUMBER_OF_ARGS;
    else
    {
        auto subject = args[0].toString();
        auto hits = re.match(subject); // @suppress(dscanner.suspicious.unmodified)
        return ScriptAny(hits);
    }
    // one of the checks above failed and has set nfe
    return ScriptAny.UNDEFINED;
}
383 
/// Native RegExp.prototype.matchAll: returns an "Iterator" script object backed by a
/// ScriptGenerator whose function yields the matched text of each match in args[0]. Reports
/// WRONG_TYPE_OF_ARG when thisObj holds no ScriptRegExp and WRONG_NUMBER_OF_ARGS when no
/// argument is given.
private ScriptAny native_RegExp_matchAll(Environment env, ScriptAny* thisObj,
                                         ScriptAny[] args, ref NativeFunctionError nfe)
{
    import std.concurrency: yield;
    import mildew.stdlib.generator: ScriptGenerator, getGeneratorPrototype;

    auto re = thisObj.toNativeObject!ScriptRegExp;
    if(re is null)
    {
        nfe = NativeFunctionError.WRONG_TYPE_OF_ARG;
        return ScriptAny.UNDEFINED;
    }
    if(args.length < 1)
    {
        nfe = NativeFunctionError.WRONG_NUMBER_OF_ARGS;
        return ScriptAny.UNDEFINED;
    }
    auto subject = args[0].toString();

    // generator body: captures re and subject, hands out one hit per yield
    ScriptAny produceHits(Environment genEnv, ScriptAny* genThis, ScriptAny[] genArgs, ref NativeFunctionError genNfe)
    {
        foreach(found; re.matchAll(subject))
            yield!ScriptAny(ScriptAny(found.hit));
        return ScriptAny.UNDEFINED;
    }

    auto hitGenerator = new ScriptGenerator(env, new ScriptFunction("iterator", &produceHits), []);
    auto iteratorObj = new ScriptObject("Iterator", getGeneratorPrototype, hitGenerator);
    return ScriptAny(iteratorObj);
}
413 
/// Native RegExp.prototype.replace: args[0] is the subject string and args[1] the replacement
/// format, both converted with toString and passed to ScriptRegExp.replace. Reports
/// WRONG_TYPE_OF_ARG when thisObj holds no ScriptRegExp and WRONG_NUMBER_OF_ARGS when fewer than
/// two arguments are given.
private ScriptAny native_RegExp_replace(Environment env, ScriptAny* thisObj,
                                        ScriptAny[] args, ref NativeFunctionError nfe)
{
    auto re = thisObj.toNativeObject!ScriptRegExp;
    if(re is null)
        nfe = NativeFunctionError.WRONG_TYPE_OF_ARG;
    else if(args.length < 2)
        nfe = NativeFunctionError.WRONG_NUMBER_OF_ARGS;
    else
    {
        auto subject = args[0].toString();
        auto replacement = args[1].toString();
        return ScriptAny(re.replace(subject, replacement));
    }
    // one of the checks above failed and has set nfe
    return ScriptAny.UNDEFINED;
}
432 
/// Native RegExp.prototype.search: converts args[0] to a string and returns the result of
/// ScriptRegExp.search for it. Reports WRONG_TYPE_OF_ARG when thisObj holds no ScriptRegExp and
/// WRONG_NUMBER_OF_ARGS when no argument is given.
private ScriptAny native_RegExp_search(Environment env, ScriptAny* thisObj,
                                       ScriptAny[] args, ref NativeFunctionError nfe)
{
    auto re = thisObj.toNativeObject!ScriptRegExp;
    if(re is null)
        nfe = NativeFunctionError.WRONG_TYPE_OF_ARG;
    else if(args.length < 1)
        nfe = NativeFunctionError.WRONG_NUMBER_OF_ARGS;
    else
    {
        auto subject = args[0].toString();
        return ScriptAny(re.search(subject));
    }
    // one of the checks above failed and has set nfe
    return ScriptAny.UNDEFINED;
}
450 
/// Native RegExp.prototype.split: converts args[0] to a string and returns the pieces produced
/// by ScriptRegExp.split. Reports WRONG_TYPE_OF_ARG when thisObj holds no ScriptRegExp and
/// WRONG_NUMBER_OF_ARGS when no argument is given.
private ScriptAny native_RegExp_split(Environment env, ScriptAny* thisObj,
                                       ScriptAny[] args, ref NativeFunctionError nfe)
{
    auto re = thisObj.toNativeObject!ScriptRegExp;
    if(re is null)
        nfe = NativeFunctionError.WRONG_TYPE_OF_ARG;
    else if(args.length < 1)
        nfe = NativeFunctionError.WRONG_NUMBER_OF_ARGS;
    else
    {
        auto subject = args[0].toString();
        return ScriptAny(re.split(subject));
    }
    // one of the checks above failed and has set nfe
    return ScriptAny.UNDEFINED;
}
468 
/// Native RegExp.prototype.exec: converts args[0] to a string and returns the array produced by
/// ScriptRegExp.exec for it. Reports WRONG_TYPE_OF_ARG when thisObj holds no ScriptRegExp and
/// WRONG_NUMBER_OF_ARGS when no argument is given.
private ScriptAny native_RegExp_exec(Environment env, ScriptAny* thisObj,
                                       ScriptAny[] args, ref NativeFunctionError nfe)
{
    auto regExp = thisObj.toNativeObject!ScriptRegExp;
    if(regExp is null)
    {
        nfe = NativeFunctionError.WRONG_TYPE_OF_ARG;
        return ScriptAny.UNDEFINED;
    }
    if(args.length < 1)
    {
        nfe = NativeFunctionError.WRONG_NUMBER_OF_ARGS;
        return ScriptAny.UNDEFINED;
    }
    auto str = args[0].toString();
    // ScriptRegExp.exec advances the object's last index, so it must run exactly once per script
    // call; calling it a second time here used to skip every other match.
    auto result = regExp.exec(str); // @suppress(dscanner.suspicious.unmodified)
    return ScriptAny(result);
}
487 
/// Native RegExp.prototype.test: converts args[0] to a string and returns the boolean result of
/// ScriptRegExp.test for it. Reports WRONG_TYPE_OF_ARG when thisObj holds no ScriptRegExp and
/// WRONG_NUMBER_OF_ARGS when no argument is given.
private ScriptAny native_RegExp_test(Environment env, ScriptAny* thisObj,
                                       ScriptAny[] args, ref NativeFunctionError nfe)
{
    auto re = thisObj.toNativeObject!ScriptRegExp;
    if(re is null)
        nfe = NativeFunctionError.WRONG_TYPE_OF_ARG;
    else if(args.length < 1)
        nfe = NativeFunctionError.WRONG_NUMBER_OF_ARGS;
    else
    {
        auto subject = args[0].toString();
        return ScriptAny(re.test(subject));
    }
    // one of the checks above failed and has set nfe
    return ScriptAny.UNDEFINED;
}
505 
// Checks the repeated-exec loop of a global regex and the search offset of a non-global one.
unittest
{
    auto testString = "foo bar foo bar foo";
    auto testRegexp = new ScriptRegExp("foo", "g");
    auto rg2 = new ScriptRegExp("bar");

    // repeated exec calls on the same subject must report each "foo" once and then stop
    size_t found = 0;
    auto result = testRegexp.exec(testString);
    assert(result != null);
    while(result.length > 0)
    {
        assert(result == ["foo"]);
        ++found;
        assert(found <= 3, "exec reported more matches than the subject contains");
        result = testRegexp.exec(testString);
    }
    assert(found == 3);

    // the first "bar" starts after "foo "
    assert(rg2.search(testString) == 4);
}