TextMateLib 1.0
Modern C++ implementation of the TextMate syntax highlighting engine
Loading...
Searching...
No Matches
tokenizeString.cpp
1#include "tokenizeString.h"
2#include "grammar.h"
3#include "rule.h"
4#include <chrono>
5#include <limits>
6#include <algorithm>
7#include <iostream>
8
9namespace tml {
10
11// Forward declarations of helper functions
12static int getFindOptions(bool allowA, bool allowG);
13static void prepareRuleSearch(
14 Rule* rule,
15 Grammar* grammar,
16 const std::string* endRegexSource,
17 bool allowA,
18 bool allowG,
19 CompiledRule*& ruleScanner,
20 int& findOptions
21);
22static void prepareRuleWhileSearch(
23 BeginWhileRule* rule,
24 Grammar* grammar,
25 const std::string* endRegexSource,
26 bool allowA,
27 bool allowG,
28 CompiledRule*& ruleScanner,
29 int& findOptions
30);
31
32// Get find options based on anchor flags
33static int getFindOptions(bool allowA, bool allowG) {
34 int options = FindOption::None;
35 if (!allowA) {
36 options |= FindOption::NotBeginString;
37 }
38 if (!allowG) {
39 options |= FindOption::NotBeginPosition;
40 }
41 return options;
42}
43
44// Prepare rule for scanning
45static void prepareRuleSearch(
46 Rule* rule,
47 Grammar* grammar,
48 const std::string* endRegexSource,
49 bool allowA,
50 bool allowG,
51 CompiledRule*& ruleScanner,
52 int& findOptions
53) {
54 // In the TS version, there's a UseOnigurumaFindOptions flag
55 // For now, we'll use compileAG which handles allowA/allowG
56 ruleScanner = rule->compileAG(grammar, grammar, endRegexSource, allowA, allowG);
57 findOptions = FindOption::None;
58}
59
60// Prepare while rule for scanning
61static void prepareRuleWhileSearch(
62 BeginWhileRule* rule,
63 Grammar* grammar,
64 const std::string* endRegexSource,
65 bool allowA,
66 bool allowG,
67 CompiledRule*& ruleScanner,
68 int& findOptions
69) {
70 ruleScanner = rule->compileWhileAG(grammar, endRegexSource, allowA, allowG);
71 findOptions = FindOption::None;
72}
73
74// Check while conditions
75IWhileCheckResult _checkWhileConditions(
76 Grammar* grammar,
77 OnigString* lineText,
78 bool isFirstLine,
79 int linePos,
80 StateStackImpl* stack,
81 LineTokens* lineTokens
82) {
83 IWhileCheckResult result;
84 result.stack = stack;
85 result.linePos = linePos;
86 result.isFirstLine = isFirstLine;
87 result.anchorPosition = (stack->beginRuleCapturedEOL ? 0 : -1);
88
89 // Collect all while rules in the stack
90 struct WhileStack {
91 StateStackImpl* stack;
92 BeginWhileRule* rule;
93 };
94 std::vector<WhileStack> whileRules;
95
96 for (StateStackImpl* node = stack; node != nullptr; node = node->parent) {
97 Rule* nodeRule = node->getRule(grammar);
98 BeginWhileRule* whileRule = dynamic_cast<BeginWhileRule*>(nodeRule);
99 if (whileRule) {
100 WhileStack ws;
101 ws.stack = node;
102 ws.rule = whileRule;
103 whileRules.push_back(ws);
104 }
105 }
106
107 // Check each while rule from bottom to top
108 std::reverse(whileRules.begin(), whileRules.end());
109
110 for (const auto& whileRuleEntry : whileRules) {
111 CompiledRule* ruleScanner = nullptr;
112 int findOptions = 0;
113 prepareRuleWhileSearch(
114 whileRuleEntry.rule,
115 grammar,
116 whileRuleEntry.stack->endRule,
117 isFirstLine,
118 linePos == result.anchorPosition,
119 ruleScanner,
120 findOptions
121 );
122
123 IOnigMatch* r = ruleScanner->scanner->findNextMatchSync(
124 lineText,
125 linePos,
126 findOptions
127 );
128
129 if (r) {
130 RuleId matchedRuleId = ruleScanner->rules[r->index];
131 if (ruleIdToNumber(matchedRuleId) != ruleIdToNumber(WHILE_RULE_ID)) {
132 // Shouldn't end up here
133 result.stack = whileRuleEntry.stack->parent;
134 break;
135 }
136
137 if (!r->captureIndices.empty()) {
138 lineTokens->produce(whileRuleEntry.stack, r->captureIndices[0].start);
139 handleCaptures(
140 grammar,
141 lineText,
142 isFirstLine,
143 whileRuleEntry.stack,
144 lineTokens,
145 whileRuleEntry.rule->whileCaptures,
146 r->captureIndices
147 );
148 lineTokens->produce(whileRuleEntry.stack, r->captureIndices[0].end);
149 result.anchorPosition = r->captureIndices[0].end;
150 if (r->captureIndices[0].end > linePos) {
151 result.linePos = r->captureIndices[0].end;
152 result.isFirstLine = false;
153 }
154 }
155 delete r;
156 } else {
157 // While condition failed, pop the rule
158 result.stack = whileRuleEntry.stack->parent;
159 break;
160 }
161 }
162
163 return result;
164}
165
166// Match rule or injections
167IMatchResult* matchRuleOrInjections(
168 Grammar* grammar,
169 OnigString* lineText,
170 bool isFirstLine,
171 int linePos,
172 StateStackImpl* stack,
173 int anchorPosition
174) {
175 // Look for normal grammar rule
176 IMatchResult* matchResult = matchRule(
177 grammar,
178 lineText,
179 isFirstLine,
180 linePos,
181 stack,
182 anchorPosition
183 );
184
185 // Look for injected rules
186 std::vector<Injection> injections = grammar->getInjections();
187 if (injections.empty()) {
188 // No injections whatsoever => early return
189 return matchResult;
190 }
191
192 IMatchInjectionsResult* injectionResult = matchInjections(
193 injections,
194 grammar,
195 lineText,
196 isFirstLine,
197 linePos,
198 stack,
199 anchorPosition
200 );
201
202 if (!injectionResult) {
203 // No injections matched => early return
204 return matchResult;
205 }
206
207 if (!matchResult) {
208 // Only injections matched => convert and return
209 IMatchResult* result = new IMatchResult();
210 result->captureIndices = injectionResult->captureIndices;
211 result->matchedRuleId = injectionResult->matchedRuleId;
212 delete injectionResult;
213 return result;
214 }
215
216 // Decide if matchResult or injectionResult should win
217 int matchResultScore = matchResult->captureIndices[0].start;
218 int injectionResultScore = injectionResult->captureIndices[0].start;
219
220 if (injectionResultScore < matchResultScore ||
221 (injectionResult->priorityMatch && injectionResultScore == matchResultScore)) {
222 // Injection won!
223 IMatchResult* result = new IMatchResult();
224 result->captureIndices = injectionResult->captureIndices;
225 result->matchedRuleId = injectionResult->matchedRuleId;
226 delete matchResult;
227 delete injectionResult;
228 return result;
229 }
230
231 delete injectionResult;
232 return matchResult;
233}
234
235// Match rule
236IMatchResult* matchRule(
237 Grammar* grammar,
238 OnigString* lineText,
239 bool isFirstLine,
240 int linePos,
241 StateStackImpl* stack,
242 int anchorPosition
243) {
244 Rule* rule = stack->getRule(grammar);
245 if (!rule) {
246 return nullptr;
247 }
248
249 CompiledRule* ruleScanner = nullptr;
250 int findOptions = 0;
251 prepareRuleSearch(
252 rule,
253 grammar,
254 stack->endRule,
255 isFirstLine,
256 linePos == anchorPosition,
257 ruleScanner,
258 findOptions
259 );
260
261 IOnigMatch* r = ruleScanner->scanner->findNextMatchSync(
262 lineText,
263 linePos,
264 findOptions
265 );
266
267 if (r) {
268 IMatchResult* result = new IMatchResult();
269 result->captureIndices = r->captureIndices;
270 result->matchedRuleId = ruleScanner->rules[r->index];
271 delete r;
272 return result;
273 }
274
275 return nullptr;
276}
277
278// Match injections
279IMatchInjectionsResult* matchInjections(
280 const std::vector<Injection>& injections,
281 Grammar* grammar,
282 OnigString* lineText,
283 bool isFirstLine,
284 int linePos,
285 StateStackImpl* stack,
286 int anchorPosition
287) {
288 // The lower the better
289 int bestMatchRating = std::numeric_limits<int>::max();
290 std::vector<IOnigCaptureIndex> bestMatchCaptureIndices;
291 RuleId bestMatchRuleId = ruleIdFromNumber(-1);
292 int bestMatchResultPriority = 0;
293 bool foundMatch = false;
294
295 std::vector<std::string> scopes = stack->contentNameScopesList->getScopeNames();
296
297 for (size_t i = 0; i < injections.size(); i++) {
298 const Injection& injection = injections[i];
299 if (!injection.matcher(scopes)) {
300 // Injection selector doesn't match stack
301 continue;
302 }
303
304 Rule* rule = grammar->getRule(injection.ruleId);
305 if (!rule) {
306 continue;
307 }
308
309 CompiledRule* ruleScanner = nullptr;
310 int findOptions = 0;
311 prepareRuleSearch(
312 rule,
313 grammar,
314 nullptr,
315 isFirstLine,
316 linePos == anchorPosition,
317 ruleScanner,
318 findOptions
319 );
320
321 IOnigMatch* matchResult = ruleScanner->scanner->findNextMatchSync(
322 lineText,
323 linePos,
324 findOptions
325 );
326
327 if (!matchResult) {
328 continue;
329 }
330
331 int matchRating = matchResult->captureIndices[0].start;
332 if (matchRating >= bestMatchRating) {
333 // Injections are sorted by priority, so previous injection had better priority
334 delete matchResult;
335 continue;
336 }
337
338 bestMatchRating = matchRating;
339 bestMatchCaptureIndices = matchResult->captureIndices;
340 bestMatchRuleId = ruleScanner->rules[matchResult->index];
341 bestMatchResultPriority = injection.priority;
342 foundMatch = true;
343
344 delete matchResult;
345
346 if (bestMatchRating == linePos) {
347 // No more need to look at the rest of the injections
348 break;
349 }
350 }
351
352 if (foundMatch) {
353 IMatchInjectionsResult* result = new IMatchInjectionsResult();
354 result->priorityMatch = (bestMatchResultPriority == -1);
355 result->captureIndices = bestMatchCaptureIndices;
356 result->matchedRuleId = bestMatchRuleId;
357 return result;
358 }
359
360 return nullptr;
361}
362
363// Handle captures
364void handleCaptures(
365 Grammar* grammar,
366 OnigString* lineText,
367 bool isFirstLine,
368 StateStackImpl* stack,
369 LineTokens* lineTokens,
370 const std::vector<CaptureRule*>& captures,
371 const std::vector<IOnigCaptureIndex>& captureIndices
372) {
373 if (captures.empty()) {
374 return;
375 }
376
377 const std::string& lineTextContent = lineText->content();
378
379 size_t len = std::min(captures.size(), captureIndices.size());
380 std::vector<LocalStackElement> localStack;
381 int maxEnd = captureIndices[0].end;
382
383 for (size_t i = 0; i < len; i++) {
384 const CaptureRule* captureRule = captures[i];
385 if (captureRule == nullptr) {
386 // Not interested
387 continue;
388 }
389
390 const IOnigCaptureIndex& captureIndex = captureIndices[i];
391
392 if (captureIndex.length == 0) {
393 // Nothing really captured
394 continue;
395 }
396
397 if (captureIndex.start > maxEnd) {
398 // Capture going beyond consumed string
399 break;
400 }
401
402 // Pop captures while needed
403 while (!localStack.empty() &&
404 localStack.back().endPos <= captureIndex.start) {
405 // Pop!
406 lineTokens->produceFromScopes(
407 localStack.back().scopes,
408 localStack.back().endPos
409 );
410 localStack.pop_back();
411 }
412
413 if (!localStack.empty()) {
414 lineTokens->produceFromScopes(
415 localStack.back().scopes,
416 captureIndex.start
417 );
418 } else {
419 lineTokens->produce(stack, captureIndex.start);
420 }
421
422 if (ruleIdToNumber(captureRule->retokenizeCapturedWithRuleId) != 0) {
423 // The capture requires additional matching
424 std::string* scopeName = captureRule->getName(
425 &lineTextContent,
426 &captureIndices
427 );
428 AttributedScopeStack* nameScopesList =
429 stack->contentNameScopesList->pushAttributed(
430 scopeName ? *scopeName : "",
431 grammar
432 );
433
434 std::string* contentName = captureRule->getContentName(
435 lineTextContent,
436 captureIndices
437 );
438 AttributedScopeStack* contentNameScopesList =
439 nameScopesList->pushAttributed(
440 contentName ? *contentName : "",
441 grammar
442 );
443
444 StateStackImpl* stackClone = stack->push(
445 captureRule->retokenizeCapturedWithRuleId,
446 captureIndex.start,
447 -1,
448 false,
449 nullptr,
450 nameScopesList,
451 contentNameScopesList
452 );
453
454 std::string substring = lineTextContent.substr(0, captureIndex.end);
455 OnigString* onigSubStr = grammar->createOnigString(substring);
456 StackElement subResult = tokenizeString(
457 grammar,
458 onigSubStr,
459 (isFirstLine && captureIndex.start == 0),
460 captureIndex.start,
461 stackClone,
462 lineTokens,
463 false,
464 0 // No time limit
465 );
466
467 delete onigSubStr;
468 delete scopeName;
469 delete contentName;
470 continue;
471 }
472
473 std::string* captureRuleScopeName = captureRule->getName(
474 &lineTextContent,
475 &captureIndices
476 );
477
478 if (captureRuleScopeName != nullptr && !captureRuleScopeName->empty()) {
479 // Push
480 AttributedScopeStack* base = !localStack.empty()
481 ? localStack.back().scopes
482 : stack->contentNameScopesList;
483 AttributedScopeStack* captureRuleScopesList =
484 base->pushAttributed(*captureRuleScopeName, grammar);
485 localStack.push_back(
486 LocalStackElement(captureRuleScopesList, captureIndex.end)
487 );
488 }
489
490 delete captureRuleScopeName;
491 }
492
493 while (!localStack.empty()) {
494 // Pop!
495 lineTokens->produceFromScopes(
496 localStack.back().scopes,
497 localStack.back().endPos
498 );
499 localStack.pop_back();
500 }
501}
502
503// Main tokenization function
504StackElement tokenizeString(
505 Grammar* grammar,
506 OnigString* lineText,
507 bool isFirstLine,
508 int linePos,
509 StateStackImpl* stack,
510 LineTokens* lineTokens,
511 bool checkWhileConditions,
512 int timeLimit
513) {
514 StackElement result;
515 int lineLength = lineText->content().length();
516 bool STOP = false;
517 int anchorPosition = -1;
518
519 if (checkWhileConditions) {
520 IWhileCheckResult whileCheckResult = _checkWhileConditions(
521 grammar,
522 lineText,
523 isFirstLine,
524 linePos,
525 stack,
526 lineTokens
527 );
528 stack = whileCheckResult.stack;
529 linePos = whileCheckResult.linePos;
530 isFirstLine = whileCheckResult.isFirstLine;
531 anchorPosition = whileCheckResult.anchorPosition;
532 }
533
534 auto startTime = std::chrono::steady_clock::now();
535
536 // Main scanning loop
537 while (!STOP) {
538
539 // Check time limit
540 if (timeLimit != 0) {
541 auto currentTime = std::chrono::steady_clock::now();
542 auto elapsedTime = std::chrono::duration_cast<std::chrono::milliseconds>(
543 currentTime - startTime
544 ).count();
545 if (elapsedTime > timeLimit) {
546 result.stack = stack;
547 result.stoppedEarly = true;
548 return result;
549 }
550 }
551
552 // scanNext logic
553 IMatchResult* r = matchRuleOrInjections(
554 grammar,
555 lineText,
556 isFirstLine,
557 linePos,
558 stack,
559 anchorPosition
560 );
561
562 if (!r) {
563 // No match
564 lineTokens->produce(stack, lineLength);
565 STOP = true;
566 continue;
567 }
568
569 const std::vector<IOnigCaptureIndex>& captureIndices = r->captureIndices;
570 RuleId matchedRuleId = r->matchedRuleId;
571
572 bool hasAdvanced = !captureIndices.empty() &&
573 captureIndices[0].end > linePos;
574
575 if (ruleIdToNumber(matchedRuleId) == ruleIdToNumber(END_RULE_ID)) {
576 // We matched the `end` for this rule => pop it
577 BeginEndRule* poppedRule = dynamic_cast<BeginEndRule*>(
578 stack->getRule(grammar)
579 );
580
581 lineTokens->produce(stack, captureIndices[0].start);
582 stack = stack->withContentNameScopesList(stack->nameScopesList);
583 handleCaptures(
584 grammar,
585 lineText,
586 isFirstLine,
587 stack,
588 lineTokens,
589 poppedRule->endCaptures,
590 captureIndices
591 );
592 lineTokens->produce(stack, captureIndices[0].end);
593
594 // Pop
595 StateStackImpl* popped = stack;
596 stack = stack->parent;
597 anchorPosition = popped->getAnchorPos();
598
599 if (!hasAdvanced && popped->getEnterPos() == linePos) {
600 // Grammar pushed & popped a rule without advancing
601 // Assume this was a mistake and continue in this state
602 stack = popped;
603 lineTokens->produce(stack, lineLength);
604 STOP = true;
605 delete r;
606 continue;
607 }
608 } else {
609 // We matched a rule!
610 Rule* _rule = grammar->getRule(matchedRuleId);
611
612 lineTokens->produce(stack, captureIndices[0].start);
613
614 StateStackImpl* beforePush = stack;
615
616 // Push it on the stack rule
617 std::string* scopeName = _rule->getName(
618 &lineText->content(),
619 &captureIndices
620 );
621 AttributedScopeStack* nameScopesList =
622 stack->contentNameScopesList->pushAttributed(
623 scopeName ? *scopeName : "",
624 grammar
625 );
626 stack = stack->push(
627 matchedRuleId,
628 linePos,
629 anchorPosition,
630 captureIndices[0].end == lineLength,
631 nullptr,
632 nameScopesList,
633 nameScopesList
634 );
635
636 delete scopeName;
637
638 BeginEndRule* beginEndRule = dynamic_cast<BeginEndRule*>(_rule);
639 if (beginEndRule) {
640 handleCaptures(
641 grammar,
642 lineText,
643 isFirstLine,
644 stack,
645 lineTokens,
646 beginEndRule->beginCaptures,
647 captureIndices
648 );
649 lineTokens->produce(stack, captureIndices[0].end);
650 anchorPosition = captureIndices[0].end;
651
652 std::string* contentName = beginEndRule->getContentName(
653 lineText->content(),
654 captureIndices
655 );
656 AttributedScopeStack* contentNameScopesList =
657 nameScopesList->pushAttributed(
658 contentName ? *contentName : "",
659 grammar
660 );
661 stack = stack->withContentNameScopesList(contentNameScopesList);
662
663 delete contentName;
664
665 if (beginEndRule->endHasBackReferences) {
666 std::string endRule = beginEndRule->getEndWithResolvedBackReferences(
667 lineText->content(),
668 captureIndices
669 );
670 stack = stack->withEndRule(endRule);
671 }
672
673 if (!hasAdvanced && beforePush->hasSameRuleAs(stack)) {
674 // Grammar pushed the same rule without advancing
675 stack = stack->pop();
676 lineTokens->produce(stack, lineLength);
677 STOP = true;
678 delete r;
679 continue;
680 }
681 } else {
682 BeginWhileRule* beginWhileRule = dynamic_cast<BeginWhileRule*>(_rule);
683 if (beginWhileRule) {
684 handleCaptures(
685 grammar,
686 lineText,
687 isFirstLine,
688 stack,
689 lineTokens,
690 beginWhileRule->beginCaptures,
691 captureIndices
692 );
693 lineTokens->produce(stack, captureIndices[0].end);
694 anchorPosition = captureIndices[0].end;
695
696 std::string* contentName = beginWhileRule->getContentName(
697 lineText->content(),
698 captureIndices
699 );
700 AttributedScopeStack* contentNameScopesList =
701 nameScopesList->pushAttributed(
702 contentName ? *contentName : "",
703 grammar
704 );
705 stack = stack->withContentNameScopesList(contentNameScopesList);
706
707 delete contentName;
708
709 if (beginWhileRule->whileHasBackReferences) {
710 std::string whileRule =
711 beginWhileRule->getWhileWithResolvedBackReferences(
712 lineText->content(),
713 captureIndices
714 );
715 stack = stack->withEndRule(whileRule);
716 }
717
718 if (!hasAdvanced && beforePush->hasSameRuleAs(stack)) {
719 // Grammar pushed the same rule without advancing
720 stack = stack->pop();
721 lineTokens->produce(stack, lineLength);
722 STOP = true;
723 delete r;
724 continue;
725 }
726 } else {
727 // MatchRule
728 MatchRule* matchingRule = dynamic_cast<MatchRule*>(_rule);
729 if (matchingRule) {
730 handleCaptures(
731 grammar,
732 lineText,
733 isFirstLine,
734 stack,
735 lineTokens,
736 matchingRule->captures,
737 captureIndices
738 );
739 lineTokens->produce(stack, captureIndices[0].end);
740
741 // Pop rule immediately since it is a MatchRule
742 stack = stack->pop();
743
744 if (!hasAdvanced) {
745 // Grammar is not advancing, nor is it pushing/popping
746 stack = stack->safePop();
747 lineTokens->produce(stack, lineLength);
748 STOP = true;
749 delete r;
750 continue;
751 }
752 }
753 }
754 }
755 }
756
757 if (!captureIndices.empty() && captureIndices[0].end > linePos) {
758 // Advance stream
759 linePos = captureIndices[0].end;
760 isFirstLine = false;
761 }
762
763 delete r;
764 }
765
766 result.stack = stack;
767 result.stoppedEarly = false;
768 result.linePos = linePos;
769 result.anchorPosition = anchorPosition;
770 return result;
771}
772
773} // namespace tml
RuleId ruleIdFromNumber(int id)
Convert an integer to a RuleId.
Definition types.h:109
const RuleId END_RULE_ID(-1)
Special rule ID indicating the end of a matched region.
int ruleIdToNumber(RuleId id)
Convert a RuleId to its integer value.
Definition types.h:116
const RuleId WHILE_RULE_ID(-2)
Special rule ID for 'while' rule matching.