捕获空格和引号内的单词?

huangapple go评论72阅读模式
英文:

Capturing words within spaces and quotation marks?

问题

以下是翻译好的部分:

正如标题所述,目标是捕获以空格分隔的单词以及引号内的词组。以下是我们要处理的输入示例:

The Brown "Fox Jumps Over" "The Lazy" Dog

目前我的代码可以捕获空格内的单词,如许多人所知,基本的strtok()足够了。以下是我的代码:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>

/*
 * Reads one line from stdin and prints each space-separated word on its
 * own line. Quotation marks get no special treatment, so a quoted phrase
 * is still split at the spaces inside it.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if no line could be read.
 */
int main(void) {
   char command[BUFSIZ];
   char *token;

   /* fgets returns NULL on EOF or read error; the buffer is unusable then. */
   if (fgets(command, sizeof command, stdin) == NULL) {
      return EXIT_FAILURE;
   }

   /* '\n' is a delimiter too, so the newline kept by fgets is discarded. */
   for (token = strtok(command, " \n"); token != NULL; token = strtok(NULL, " \n")) {
      printf(" %s\n", token);
   }

   return EXIT_SUCCESS;
}

而且如预期的那样,我的代码输出如下:

The
Brown
"Fox
Jumps
Over"
"The
Lazy"
Dog

但整个想法和问题是获得以下输出:

The
Brown
Fox Jumps Over
The Lazy
Dog

欢迎提供任何帮助,提前感谢。
(附注:所包括的库是唯一允许的。)

英文:

The idea, as stated in the title, is to capture words separated by spaces as well as phrases enclosed in quotation marks. Here's an example of the input we are dealing with:

Input:

The Brown "Fox Jumps Over" "The Lazy" Dog

Currently my code can capture words within spaces, as many of you know, a basic strtok() is enough. Here's my code so far:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>

/*
 * Reads one line from stdin and prints each space-separated word on its
 * own line. Quotation marks get no special treatment, so a quoted phrase
 * is still split at the spaces inside it.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if no line could be read.
 */
int main(void) {
   char command[BUFSIZ];
   char *token;

   /* fgets returns NULL on EOF or read error; the buffer is unusable then. */
   if (fgets(command, sizeof command, stdin) == NULL) {
      return EXIT_FAILURE;
   }

   /* '\n' is a delimiter too, so the newline kept by fgets is discarded. */
   for (token = strtok(command, " \n"); token != NULL; token = strtok(NULL, " \n")) {
      printf(" %s\n", token);
   }

   return EXIT_SUCCESS;
}

And as expected, my code prints the following:

Current Output:

The
Brown
"Fox
Jumps
Over"
"The
Lazy"
Dog

But the whole idea and problem is to get the following output:

The
Brown
Fox Jumps Over
The Lazy
Dog

All the help is welcome and I thank you in advance.
(PS: The included libraries are the only ones allowed.)

答案1

得分: 2

这个程序适用于您的输入,它使用了一个小型状态机,可以防止在引号内拆分。strtok 对于比单个拆分令牌更复杂的情况来说相当有限,我个人认为如此:

#include <stdio.h>
#include <stdlib.h>

/*
 * Prints str to stdout wrapped in "<< " and " >>" followed by a newline,
 * which makes stray leading or trailing spaces in a token visible.
 *
 * str: NUL-terminated string to print; it is only read, never modified.
 */
void prn(const char *str) {
    printf("<< %s >>\n", str);
}

/*
 * Reads one line from stdin and prints each token with prn().
 *
 * Outside double quotes, tokens are separated by spaces. Text between a
 * pair of double quotes forms a single token with its spaces preserved.
 * The line is split in place by overwriting separators with NUL.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if no line could be read.
 */
int main(void) {
    char command[BUFSIZ];
    int in_quotes = 0;
    char *start;
    char *cur;

    /* fgets returns NULL on EOF or read error; the buffer is unusable then. */
    if (fgets(command, sizeof command, stdin) == NULL) {
        return EXIT_FAILURE;
    }

    /* Drop the newline fgets keeps so it cannot end up in the last token. */
    for (cur = command; *cur != '\0'; cur++) {
        if (*cur == '\n') {
            *cur = '\0';
            break;
        }
    }

    /* fgets always NUL-terminates, so this scan stays inside the buffer. */
    start = cur = command;
    while (*cur != '\0') {
        if (!in_quotes && *cur == ' ') {
            /* Space outside quotes ends the current token, if there is one. */
            *cur = '\0';
            if (*start != '\0') {
                prn(start);
            }
            start = ++cur;
        } else if (*cur == '"') {
            *cur = '\0';
            /*
             * A closing quote always emits its contents, even when empty.
             * An opening quote flushes any text that directly precedes it.
             */
            if (in_quotes || *start != '\0') {
                prn(start);
            }
            in_quotes = !in_quotes;
            start = ++cur;
        } else {
            cur++;
        }
    }

    /* Emit the trailing token; skip it when the line ended on a separator. */
    if (*start != '\0') {
        prn(start);
    }

    return EXIT_SUCCESS;
}

输出:

➜ echo -n 'The Brown "Fox Jumps Over" "The Lazy" Dog' |./a.out
<< The >>
<< Brown >>
<< Fox Jumps Over >>
<< The Lazy >>
<< Dog >>
英文:

This program works for your input, it employs a tiny state machine that prevents splitting between quotes. strtok is pretty limited for cases more complicated than a single split token IMO:

#include <stdio.h>
#include <stdlib.h>

/*
 * Prints str to stdout wrapped in "<< " and " >>" followed by a newline,
 * which makes stray leading or trailing spaces in a token visible.
 *
 * str: NUL-terminated string to print; it is only read, never modified.
 */
void prn(const char *str) {
    printf("<< %s >>\n", str);
}

/*
 * Reads one line from stdin and prints each token with prn().
 *
 * Outside double quotes, tokens are separated by spaces. Text between a
 * pair of double quotes forms a single token with its spaces preserved.
 * The line is split in place by overwriting separators with NUL.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if no line could be read.
 */
int main(void) {
    char command[BUFSIZ];
    int in_quotes = 0;
    char *start;
    char *cur;

    /* fgets returns NULL on EOF or read error; the buffer is unusable then. */
    if (fgets(command, sizeof command, stdin) == NULL) {
        return EXIT_FAILURE;
    }

    /* Drop the newline fgets keeps so it cannot end up in the last token. */
    for (cur = command; *cur != '\0'; cur++) {
        if (*cur == '\n') {
            *cur = '\0';
            break;
        }
    }

    /* fgets always NUL-terminates, so this scan stays inside the buffer. */
    start = cur = command;
    while (*cur != '\0') {
        if (!in_quotes && *cur == ' ') {
            /* Space outside quotes ends the current token, if there is one. */
            *cur = '\0';
            if (*start != '\0') {
                prn(start);
            }
            start = ++cur;
        } else if (*cur == '"') {
            *cur = '\0';
            /*
             * A closing quote always emits its contents, even when empty.
             * An opening quote flushes any text that directly precedes it.
             */
            if (in_quotes || *start != '\0') {
                prn(start);
            }
            in_quotes = !in_quotes;
            start = ++cur;
        } else {
            cur++;
        }
    }

    /* Emit the trailing token; skip it when the line ended on a separator. */
    if (*start != '\0') {
        prn(start);
    }

    return EXIT_SUCCESS;
}

The output:

➜ echo -n 'The Brown "Fox Jumps Over" "The Lazy" Dog' |./a.out
<< The >>
<< Brown >>
<< Fox Jumps Over >>
<< The Lazy >>
<< Dog >>

[edit: tidied following feedback, printing delimited to catch any sneaky spaces creeping through]

答案2

得分: 0

This also uses a state machine. Unlike the other answer, it ignores quotes inside words by having 3 states, empty, word, and quote. It also has error detection, and handles tabs and multiple spaces, but is more complex.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <assert.h>

/*!conditions:re2c*/

/*
 * Extracts the next token from *text, splitting the buffer in place.
 * This is re2c input, not plain C: the rules below are expanded by re2c.
 *
 * A token is either a run of characters ended by whitespace or the text
 * between a pair of double quotes. A quote seen after a word has started
 * is consumed as an ordinary character of that word.
 *
 * text:  in/out cursor into a writable, NUL-terminated buffer; advanced
 *        past the token that is returned.
 * token: out; set to the start of the token, or to 0 at end of input.
 *
 * Returns 1 when a token was produced. Returns 0 at end of input, and
 * also on an unterminated quote, in which case errno is set to EILSEQ.
 */
static int scan(char **text, const char **token) {
    /* yyt1 is presumably the tag variable used by re2c's generated code
       (the -T option) -- confirm against the generated file. */
    char *YYCURSOR, *yyt1, *open = 0, *close = 0;
    enum YYCONDTYPE condition = yycempty;
    /* Validate the arguments before *text is read. */
    assert(text && token);
    YYCURSOR = *text;
    /*!re2c /**/
    re2c:define:YYCTYPE = char;
    re2c:yyfill:enable = 0;
    re2c:define:YYGETCONDITION = "condition";
    re2c:define:YYSETCONDITION = "condition = @@;";
    re2c:define:YYGETCONDITION:naked = 1;
    re2c:define:YYSETCONDITION:naked = 1;

    space = [ \t\v\n];
    nul = "\x00";
    quote = "\"";
    */
    for( ; ; ) { /*!re2c /**/
        <empty> nul { return *token = 0, 0; }
        <empty> space+ { continue; } /* Leading space. */
        <empty> quote @open :=> quote
        <empty> @open * :=> word

        <word> nul { *text = YYCURSOR - 1; *token = open; return 1; } /* Park on the NUL so the next call ends. */
        <word> @close space
            { *close = '\0'; *text = close + 1; *token = open; return 1; }
        <word> * { continue; }

        <quote> @close quote
            { *close = '\0'; *text = close + 1; *token = open; return 1; }
        <quote> nul { return errno = EILSEQ, 0; } /* Unterminated quote. */
        <quote> * { continue; }
        */
    }
}

/*
 * Reads one line from stdin and prints every token produced by scan() on
 * its own line.
 *
 * Returns EXIT_FAILURE when no line could be read or when errno is set
 * once scanning stops; in both cases the error is reported with perror()
 * if errno is non-zero. Returns EXIT_SUCCESS otherwise.
 */
int main(void) {
    char command[BUFSIZ];
    char *cursor;
    const char *word;
    int failed;

    errno = 0;
    cursor = fgets(command, BUFSIZ, stdin);
    failed = (cursor == NULL);

    if (!failed) {
        while (scan(&cursor, &word)) {
            printf("%s\n", word);
        }
        /* scan() signals an error through errno rather than its result. */
        failed = (errno != 0);
    }

    if (failed) {
        if (errno) {
            perror("capture");
        }
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}

Uses re2c as re2c -W -T -c -o main.re.c main.c to generate the code.

英文:

This also uses a state machine. Unlike the other answer, it ignores quotes inside words by having 3 states, empty, word, and quote. It also has error detection, and handles tabs and multiple spaces, but is more complex.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <assert.h>
/*!conditions:re2c*/
/*
 * Extracts the next token from *text, splitting the buffer in place.
 * This is re2c input, not plain C: the rules below are expanded by re2c.
 *
 * A token is either a run of characters ended by whitespace or the text
 * between a pair of double quotes. A quote seen after a word has started
 * is consumed as an ordinary character of that word.
 *
 * text:  in/out cursor into a writable, NUL-terminated buffer; advanced
 *        past the token that is returned.
 * token: out; set to the start of the token, or to 0 at end of input.
 *
 * Returns 1 when a token was produced. Returns 0 at end of input, and
 * also on an unterminated quote, in which case errno is set to EILSEQ.
 */
static int scan(char **text, const char **token) {
    /* yyt1 is presumably the tag variable used by re2c's generated code
       (the -T option) -- confirm against the generated file. */
    char *YYCURSOR, *yyt1, *open = 0, *close = 0;
    enum YYCONDTYPE condition = yycempty;
    /* Validate the arguments before *text is read. */
    assert(text && token);
    YYCURSOR = *text;
    /*!re2c /**/
    re2c:define:YYCTYPE = char;
    re2c:yyfill:enable = 0;
    re2c:define:YYGETCONDITION = "condition";
    re2c:define:YYSETCONDITION = "condition = @@;";
    re2c:define:YYGETCONDITION:naked = 1;
    re2c:define:YYSETCONDITION:naked = 1;

    space = [ \t\v\n];
    nul = "\x00";
    quote = "\"";
    */
    for( ; ; ) { /*!re2c /**/
        <empty> nul { return *token = 0, 0; }
        <empty> space+ { continue; } /* Leading space. */
        <empty> quote @open :=> quote
        <empty> @open * :=> word

        <word> nul { *text = YYCURSOR - 1; *token = open; return 1; } /* Park on the NUL so the next call ends. */
        <word> @close space
            { *close = '\0'; *text = close + 1; *token = open; return 1; }
        <word> * { continue; }

        <quote> @close quote
            { *close = '\0'; *text = close + 1; *token = open; return 1; }
        <quote> nul { return errno = EILSEQ, 0; } /* Unterminated quote. */
        <quote> * { continue; }
        */
    }
}
/*
 * Reads one line from stdin and prints every token produced by scan() on
 * its own line.
 *
 * Returns EXIT_FAILURE when no line could be read or when errno is set
 * once scanning stops; in both cases the error is reported with perror()
 * if errno is non-zero. Returns EXIT_SUCCESS otherwise.
 */
int main(void) {
    int success = EXIT_SUCCESS;
    char command[BUFSIZ], *input;
    const char *token;

    errno = 0;
    if (!(input = fgets(command, BUFSIZ, stdin))) goto catch;
    while (scan(&input, &token)) printf("%s\n", token);
    /* scan() signals an error through errno rather than its result. */
    if (errno) goto catch;
    goto finally;
catch:
    success = EXIT_FAILURE;
    if (errno) perror("capture");
finally:
    return success;
}

Uses re2c as re2c -W -T -c -o main.re.c main.c to generate the code.

huangapple
  • 本文由 发表于 2023年3月21日 01:53:41
  • 转载请务必保留本文链接:https://go.coder-hub.com/75793687-4.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定