Module:Delink

From Frontierpedia, the Microsoft Agent encyclopedia
Revision as of 07:50, 5 April 2013 by en>Mr. Stradivarius (html comments are probably mostly removed by the parser anyway - testing)

Documentation for this module may be created at Module:Delink/doc

-- This module de-links most wikitext.

-- Table of functions exported by this module. Declared local so that loading
-- the module does not create (or overwrite) a global named "p".
local p = {}

-- Handles links written with the reverse pipe trick, i.e. "[[|text]]".
-- If a second pipe follows the opening "[[|" the link is returned untouched;
-- otherwise the text between "[[|" and the closing "]]" is returned
-- (nil when s contains no such link).
local function delinkReversePipeTrick(s)
    local has_second_pipe = mw.ustring.match(s, "^%[%[|.*|") ~= nil
    if has_second_pipe then
        return s
    end
    local label = mw.ustring.match(s, "%[%[|(.*)%]%]")
    return label
end

-- Works out the display text of a link that uses the pipe trick
-- ("[[Some title|]]"), per [[Help:Pipe trick]].
-- Steps, in order:
--   1. drop the square brackets and the trailing pipe, together with
--      everything up to and including the first colon if there is one;
--   2. if the remainder ends in a parenthesised part, drop that part (and a
--      single space before it);
--   3. otherwise, if there is a comma, keep only the text before the first
--      comma.
local function delinkPipeTrick(s)
    -- Choose the stripping pattern according to whether a colon is present.
    local strip_pattern = "%[%[(.*)|%]%]"
    if mw.ustring.match(s, ":") then
        strip_pattern = "%[%[.-:(.*)|%]%]"
    end
    local label = mw.ustring.match(s, strip_pattern)

    -- Brackets trump commas, so they are tested first.
    if mw.ustring.match(label, "%(.-%)$") then
        return mw.ustring.match(label, "(.-) ?%(.-%)$")
    end
    if mw.ustring.match(label, ",") then
        return mw.ustring.match(label, "(.-),.*$")
    end
    return label
end

-- De-links a single wikilink.
--
-- s is expected to be one complete wikilink including its square brackets,
-- e.g. "[[Foo|bar]]". The return value is:
--   * the display text, for an ordinary (piped or unpiped) link;
--   * the result of delinkReversePipeTrick / delinkPipeTrick for links that
--     use the reverse pipe trick or the pipe trick;
--   * "" for category links, file/image links, and links whose prefix is a
--     known language tag;
--   * s itself when the title area is not accepted by mw.title.new or
--     contains a string listed in other_invalid_link_strings.
local function delinkWikilink(s)
    -- Deal with the reverse pipe trick. The check is anchored so that only a
    -- link that actually opens with "[[|" is treated this way; a "[[|" found
    -- further inside the match must not cause the text before it to be lost.
    if mw.ustring.match(s, "^%[%[|") then
        return delinkReversePipeTrick(s)
    end

    -- Check for bad titles. To do this we need to find the
    -- title area of the link, i.e. the part before any pipes.
    local titlearea
    if mw.ustring.match(s, "|") then -- Find if we're dealing with a piped link.
        titlearea = mw.ustring.match(s, "^%[%[(.-)|.*%]%]")
    else
        titlearea = mw.ustring.match(s, "^%[%[(.-)%]%]")
    end
    -- If the title area is not a valid title, return the whole string.
    -- Use pcall in case we're over the expensive functions limit.
    local goodcall, title = pcall(mw.title.new, titlearea, "")
    if not (goodcall and title) then
        return s
    end
    -- Check for characters that are allowed in titles but not in wikilinks.
    -- NOTE(review): the single entry below is reproduced exactly as it appears
    -- in this copy of the source (U+FFFD); confirm it is the intended character.
    local other_invalid_link_strings = { '�' }
    for _, v in ipairs(other_invalid_link_strings) do
        if mw.ustring.match(titlearea, v) then
            return s
        end
    end

    -- Check for categories, interwikis, and files.
    local colonprefix = mw.ustring.match(s, "%[%[(.-):.*%]%]") or "" -- Get the text before the first colon.
    -- Namespace names are matched case-insensitively, so "CATEGORY:" and
    -- "file:" are recognised as well as "Category:" and "File:".
    local namespace = mw.ustring.lower(colonprefix)
    if mw.language.isKnownLanguageTag(colonprefix)
    or namespace == "category"
    or namespace == "file"
    or namespace == "image" then
        return ""
    end

    -- Remove the colon if the link is using the [[Help:Colon trick]].
    -- Anchored for the same reason as the reverse pipe trick check above.
    if mw.ustring.match(s, "^%[%[:") then
        s = "[[" .. mw.ustring.match(s, "^%[%[:(.*%]%])")
    end

    -- Deal with links using the [[Help:Pipe trick]].
    if mw.ustring.match(s, "^%[%[[^|]*|%]%]") then
        return delinkPipeTrick(s)
    end

    -- Find the display area of the wikilink.
    local display
    if mw.ustring.match(s, "|") then -- Piped link: everything after the first pipe.
        display = mw.ustring.match(s, "^%[%[.-|(.+)%]%]")
    else -- Unpiped link: everything between the brackets.
        display = mw.ustring.match(s, "^%[%[(.-)%]%]")
    end

    return display
end

-- De-links a single external link.
--
-- Assumes internal wikilinks have already been de-linked and that s is some
-- text enclosed in one pair of square brackets, e.g. "[http://example.com x]".
-- If s does not start with one of the recognised URL prefixes followed by at
-- least one character that is neither a double quote nor whitespace, s is
-- returned unchanged. Otherwise the display text is returned, which is ""
-- when the link has none.
local function delinkURL(s)
    local schemes = {"//", "http://", "https://", "ftp://", "gopher://", "mailto:", "news:", "irc://"}

    -- Find the first prefix that s starts with.
    local matched_scheme
    for index = 1, #schemes do
        local scheme = schemes[index]
        if mw.ustring.match(s, '^%[' .. scheme ..'[^"%s].*%]' ) then
            matched_scheme = scheme
            break
        end
    end

    if matched_scheme == nil then
        return s
    end

    -- Everything after the URL prefix and before the final square bracket.
    local remainder = mw.ustring.match(s, "^%[" .. matched_scheme .. "(.*)%]")
    -- Everything from the first URL separator character ("<> ) onwards.
    local label = mw.ustring.match(remainder, '^.-(["<> ].*)') or ""
    -- Trim leading whitespace, which covers the case of a space separator.
    return mw.ustring.match(label, "^%s*(%S.*)$") or ""
end

-- Applies delinkFunction to every piece of s that matches pattern.
--
-- s              the text to process; must be a string.
-- pattern        a ustring pattern describing one link; must begin with "^".
-- delinkFunction passed to mw.ustring.gsub as the replacement, so it receives
--                the matched text and its result replaces that text.
-- Returns the processed text. Raises an error (reported at the caller's
-- position) if s is not a string or pattern is not a string starting with "^".
local function delinkLinkClass(s, pattern, delinkFunction)
    -- Note: "not type(s) == 'string'" would parse as "(not type(s)) == 'string'"
    -- and never be true, so the comparison is written with ~=.
    if type(s) ~= "string" then
        error("Attempt to de-link non-string input.", 2)
    end
    if not ( type(pattern) == "string" and mw.ustring.sub(pattern, 1, 1) == "^" ) then
        error('Invalid pattern detected. Patterns must begin with "^".', 2)
    end
    -- Iterate over the text string, and replace any matched text using the
    -- delink function. We need to iterate character by character rather
    -- than just use gsub, otherwise nested links aren't detected properly.
    -- Characters are collected in a table and joined once at the end, rather
    -- than growing a string by repeated concatenation.
    local pieces = {}
    while mw.ustring.len(s) > 0 do
        -- Replace text at the start of s using one iteration of gsub.
        s = mw.ustring.gsub(s, pattern, delinkFunction, 1)
        -- Move the left-most character over to the result.
        pieces[#pieces + 1] = mw.ustring.sub(s, 1, 1)
        s = mw.ustring.sub(s, 2, -1)
    end
    return table.concat(pieces)
end

-- Core of the module: de-links the text found in args[1] (or "" if absent).
-- Wikilinks are de-linked first, then external links, and finally every run
-- of whitespace is collapsed to a single space.
local function _delink(args)
    local input = args[1] or ""
    -- Stripping of HTML comments is currently switched off:
    --    input = mw.ustring.gsub(input, "<!%-%-.-%-%->", "")
    local without_wikilinks = delinkLinkClass(input, "^%[%[.-%]%]", delinkWikilink)
    local without_urls = delinkLinkClass(without_wikilinks, "^%[.-%]", delinkURL)
    -- Only the first result of gsub (the string) is kept; the count is dropped.
    local collapsed = mw.ustring.gsub(without_urls, "%s+", " ")
    return collapsed
end

-- Public entry point.
--
-- When frame is the current frame we are being called via #invoke: the
-- arguments given to #invoke are used if there are any, otherwise the
-- arguments passed to the invoking template are used. For any other value we
-- are being called from another module or from the debug console, and frame
-- is taken to be the argument table itself.
function p.delink(frame)
    if frame ~= mw.getCurrentFrame() then
        return _delink(frame)
    end

    -- Start from the template's arguments, then switch to the #invoke
    -- arguments as soon as one is found.
    local args = frame:getParent().args
    -- pairs() is used to test for "at least one argument"; presumably because
    -- frame.args is a Scribunto proxy table on which next() and # are not
    -- reliable (see the Scribunto frame-object documentation).
    for _ in pairs(frame.args) do
        args = frame.args
        break
    end
    return _delink(args)
end

return p