import tokenize

# I don't think this is a minimized state machine, but it's clearer this
# way. Namely, the class vs. function states can be merged

# In the start of the module when we're expecting possibly a string that gets marked as a docstring
STATE_EXPECT_MODULE_DOCSTRING = 0
# After seeing the class keyword, we're waiting for the block colon (and do bracket counting)
STATE_EXPECT_CLASS_COLON = 1
# After seeing the colon in a class definition we're expecting possibly a docstring
STATE_EXPECT_CLASS_DOCSTRING = 2
# Same as EXPECT_CLASS_COLON, but for function definitions
STATE_EXPECT_FUNCTION_COLON = 3
# Same as EXPECT_CLASS_DOCSTRING, but for function definitions
STATE_EXPECT_FUNCTION_DOCSTRING = 4
# Just skipping tokens until we observe a class or a def.
STATE_OTHER = 5

# These tokens don't matter here - they don't get in the way of docstrings
TOKENS_TO_IGNORE = [
    tokenize.NEWLINE,
    tokenize.INDENT,
    tokenize.DEDENT,
    tokenize.NL,
    tokenize.COMMENT,
]


def get_docstring_tokens(tokens):
    state = STATE_EXPECT_MODULE_DOCSTRING
    # The number of currently open parentheses, square brackets, etc.
    # This doesn't check if they're properly balanced, i.e. there isn't ([)], but we shouldn't
    # need to - if they aren't, it shouldn't parse at all, so we ignore the bracket type
    bracket_count = 0
    docstring_tokens = set()

    for token in tokens:
        if token.type in TOKENS_TO_IGNORE:
            continue
        if token.type == tokenize.STRING:
            if state in [STATE_EXPECT_MODULE_DOCSTRING, STATE_EXPECT_CLASS_DOCSTRING,
                         STATE_EXPECT_FUNCTION_DOCSTRING]:
                docstring_tokens.add(token)
                state = STATE_OTHER
        # A class means we'll expect the class token
        elif token.type == tokenize.NAME and token.string == 'class':
            state = STATE_EXPECT_CLASS_COLON
            # Just in case - they should be balanced normally
            bracket_count = 0
        # A def means we'll expect a colon after that
        elif token.type == tokenize.NAME and token.string == 'def':
            state = STATE_EXPECT_FUNCTION_COLON
            # Just in case - they should be balanced normally
            bracket_count = 0
        # If we get a colon and we're expecting it, move to the next state
        elif token.type == tokenize.OP and token.string == ':':
            # If there are still left brackets open, it must be something other than the block start
            if bracket_count == 0:
                if state == STATE_EXPECT_CLASS_COLON:
                    state = STATE_EXPECT_CLASS_DOCSTRING
                elif state == STATE_EXPECT_FUNCTION_COLON:
                    state = STATE_EXPECT_FUNCTION_DOCSTRING
        # Count opening and closing brackets in bracket_count
        elif token.type == tokenize.OP and token.string in ['(', '[', '{']:
            bracket_count += 1
        elif token.type == tokenize.OP and token.string in [')', ']', '}']:
            bracket_count -= 1
        # The token is not one of the recognized types. If we're expecting a colon, then all good,
        # but if we're expecting a docstring, it would no longer be a docstring
        elif state in [STATE_EXPECT_MODULE_DOCSTRING, STATE_EXPECT_CLASS_DOCSTRING,
                       STATE_EXPECT_FUNCTION_DOCSTRING]:
            state = STATE_OTHER

    return docstring_tokens