-- Module:labels/data/lang/zh/functions
--
-- From Wiktionary, the free dictionary
-- < Module:labels | data | lang | zh
-- Jump to navigation Jump to search

-- Name of the main labels module; it is loaded lazily (via require) by the handlers that need it.
local labels_module = "Module:labels"

-- Module table. `postprocess_handlers` is the list that handler functions are appended to below.
local export = {
	postprocess_handlers = {},
}

-- Remove duplicated labels like 'Taiwanese' in 'Taiwanese Hokkien|and|Taiwanese Hakka'. Also remove duplicated labels
-- in things like
-- * 'Quanzhou|_|Hokkien' (which canonicalizes to 'Quanzhou Hokkien|_|Hokkien');
-- * 'Xiamen|and|Quanzhou|_|Hokkien' (which canonicalizes to 'Xiamen Hokkien|and|Quanzhou Hokkien|_|Hokkien');
-- * 'Xiamen|and|Anxi|_|Hokkien' (which canonicalizes to 'Xiamen Hokkien|and|Anxi|_|Hokkien');
-- * 'Xiamen|Zhangzhou|and|Quanzhou|_|Hokkien' (which canonicalizes to 'Xiamen Hokkien|Zhangzhou Hokkien|and|Quanzhou Hokkien|_|Hokkien');
-- * 'Xiamen|Zhangzhou|and|Anxi|_|Hokkien' (which canonicalizes to 'Xiamen Hokkien|Zhangzhou Hokkien|and|Anxi|_|Hokkien').
-- We do two passes. The first pass fixes cases like 'Quanzhou Hokkien|_|Hokkien', irrespective of whether there's an
-- "and" present. The second pass looks for a stretch of labels where (a) all of the labels have the same prefix or
-- suffix, and (b) in between the labels is at least one occurrence of "and" (which can also start out as "&" but is
-- canonicalized to "and"); but (c) we count two labels separated by "_" (which is canonicalized to a blank label) as a
-- single label.
--
-- The handler takes a single argument `data`, reads the list `data.labels`, and rewrites the `label` field of
-- individual entries of that list in place. It returns nothing.
table.insert(export.postprocess_handlers,
	function(data)
		local labels = data.labels
		if #labels < 2 then
			-- With fewer than two labels there is nothing that could be duplicated.
			return
		end
		local m_labels = require(labels_module)

		-- First, split the labels into `link` and `display` component parts (done only once).
		-- NOTE: `split_labels` is intentionally never refreshed after a label is rewritten below, so every affix is
		-- matched against the text the label had on entry. A later affix that matches the same label therefore
		-- overwrites (rather than compounds with) the result of an earlier one; see the ordering note on "Min".
		local split_labels = {}
		for i, label in ipairs(labels) do
			local link, display = m_labels.split_display_form(label.label)
			split_labels[i] = {link = link, display = display}
		end

		-- Then compute "label starts" (indices of label sets to consider when looking for runs with the same prefix or
		-- suffix), where a label start is either a single label or a set of two labels separated by an underscore,
		-- and where we take occurrences of "and" into consideration. Each entry has the fields:
		-- * `start`: index into `labels` of the first label of the set;
		-- * `followed_by_and`: true if an "and" label directly follows this set;
		-- * `after_underscore`: index of the label following the blank ("_") label, or nil if there is none.
		local label_starts = {}
		local i = 1
		while i <= #labels do
			local start = i
			local followed_by_and = false
			local after_underscore
			-- All comparisons must be made against the `.label` string of each entry; the entries themselves are
			-- tables and a table never compares equal to a string.
			if i <= #labels - 4 and labels[i + 1].label == "" and labels[i + 2].label == "and" and
				labels[i + 3].label == "" then
				-- 'Foo|_|and|_|Bar'; redundant underscores. Skip over all three connector labels.
				followed_by_and = true
				i = i + 3
			elseif i <= #labels - 2 and labels[i + 1].label == "and" then
				-- 'Foo|and|Bar'
				followed_by_and = true
				i = i + 1
			elseif i <= #labels - 2 and labels[i + 1].label == "" then
				-- 'Foo|_|Bar'. Only the blank label is skipped here, so 'Bar' is also visited as a label start of
				-- its own on the next iteration.
				after_underscore = i + 2
				i = i + 1
			end
			table.insert(label_starts, {
				start = start,
				followed_by_and = followed_by_and,
				after_underscore = after_underscore
			})
			i = i + 1
		end

		-- Now the main loop.

		-- Each spec is {"affix", `at_beginning`}, or {{"affix", "affix"}, `at_beginning`} where "affix" is a prefix or
		-- suffix to remove and `at_beginning` indicates whether "affix" is a prefix or suffix. If more than one affix
		-- is listed, any affix counts, e.g. 'Taiwan Mandarin|and|Taiwanese Hokkien'.
		for _, affix_spec in ipairs {
			{{"Taiwanese", "Taiwan"}, true}, {"Chinese"}, {"Gan"}, {"Hakka"}, {"Hokkien"}, {"Mandarin"},
			-- Min needs to go before Southern Min, Eastern Min, etc. because the later check for e.g. Eastern Min
			-- will overwrite the value set by Min if both match. With Min later, we'll end up with e.g.
			-- "Fuqing Eastern Eastern Min".
			{"Min"}, {"Southern Min"}, {"Eastern Min"}, {"Northern Min"}, {"Central Min"}, {"Wu"}, {"Xiang"}
		} do
			local affixes, at_beginning = unpack(affix_spec)
			if type(affixes) == "string" then
				-- Normalize the single-affix shorthand to a list.
				affixes = {affixes}
			end

			-- Does `item` match against the prefix or suffix when both prefix/suffix and something else are
			-- present? If so, return the something else, which is what we need to set the label to if we remove
			-- the prefix/suffix. Otherwise return false.
			local function matches_affix_with_space(item)
				for _, affix in ipairs(affixes) do
					-- The affix is spliced into a Lua pattern unescaped; none of the affixes listed above contain
					-- pattern magic characters.
					local space_regex = at_beginning and "^" .. affix .. " (.+)$" or "^(.+) " .. affix .. "$"
					local rest = item:match(space_regex)
					if rest then
						return rest
					end
				end
				return false
			end
			-- Does `item` match against the prefix or suffix exactly? If so, return an empty string, which is what
			-- we need to set the label to if we remove the prefix/suffix. Otherwise return false.
			local function matches_affix_exactly(item)
				for _, affix in ipairs(affixes) do
					if item == affix then
						return ""
					end
				end
				return false
			end
			-- Does the link or display at `label_index` match with `match_function`? If so, return a three-element
			-- list of `label_index`, `component` (either "link" or "display") and the return value of the match.
			-- Otherwise return nil. The display is tried first; the link is only consulted if the display fails.
			local function check_match(label_index, match_function)
				local link, display = split_labels[label_index].link, split_labels[label_index].display
				local rest = display and match_function(display)
				if rest then
					return {label_index, "display", rest}
				else
					-- NOTE(review): the link is always tested with `matches_affix_with_space`, even when
					-- `match_function` is `matches_affix_exactly` -- confirm this asymmetry is intended.
					rest = link and matches_affix_with_space(link)
					if rest then
						return {label_index, "link", rest}
					end
				end
				return nil
			end
			-- Given {`label_index`, `component`, `value`}, set the link or display component (depending on `component`)
			-- of the label at `label_index` to `value`. An empty `value` blanks the whole label instead.
			local function set_component_value(to_erase)
				local label_index, component, value = unpack(to_erase)
				if value == "" then
					labels[label_index].label = ""
				else
					local link, display = split_labels[label_index].link, split_labels[label_index].display
					if component == "display" then
						display = value
					else
						link = value
					end
					labels[label_index].label = m_labels.combine_display_form_parts(link, display)
				end
			end

			-- First pass: Look for two labels separated by an underscore, with the suffix occurring on both parts.
			-- (This shouldn't happen with prefixes.)
			if not at_beginning then
				for _, label_start in ipairs(label_starts) do
					local to_erase = check_match(label_start.start, matches_affix_with_space)
					if to_erase and label_start.after_underscore and
						check_match(label_start.after_underscore, matches_affix_exactly) then
						set_component_value(to_erase)
					end
				end
			end

			-- Second pass.

			-- Check whether a prefix or suffix matches the given label start index (index of a label set in the
			-- `label_starts` list; see above). If it matches, return value is {`index`, `component`, `value`}, i.e.
			-- the label index to change, the component ("link" or "display") to change and the value to set the
			-- component to. Otherwise, return nil.
			local function affix_matches(label_start_index)
				local label_start = label_starts[label_start_index]
				-- If we're dealing with a suffix, there are two cases: (1) 'Quanzhou Hokkien';
				-- (2) 'Quanzhou|_|Hokkien'. If we're dealing with a prefix, there are similarly (1) 'Taiwanese Hakka';
				-- (2) 'Taiwanese|_|Hakka'. In addition, we have to check both the link and the display.
				local to_erase = check_match(label_start.start, matches_affix_with_space)
				if to_erase then
					return to_erase
				end
				local after_underscore = label_start.after_underscore
				if not after_underscore then
					return nil
				end
				-- Case (2): a prefix stands alone before the underscore, a suffix stands alone after it.
				return check_match(at_beginning and label_start.start or after_underscore, matches_affix_exactly)
			end

			-- Now, try to find a run of two or more label sets with the same prefix or suffix, with at least one "and"
			-- in the middle.
			local j = 1
			while j <= #label_starts - 1 do
				local saw_and = false
				local run = {}
				local match = affix_matches(j)
				if match then
					table.insert(run, match)
					local k = j + 1
					while k <= #label_starts do
						match = affix_matches(k)
						if not match then
							break
						end
						table.insert(run, match)
						-- An "and" counts only if it sits between two members of the run.
						if label_starts[k - 1].followed_by_and then
							saw_and = true
						end
						k = k + 1
					end
					if #run > 1 and saw_and then
						-- We saw a run of two or more with at least one 'and' in the middle. Remove the prefix or
						-- suffix from all but the last (if we're dealing with a suffix) or all but the first (if we're
						-- dealing with a prefix).
						if at_beginning then
							table.remove(run, 1)
						else
							table.remove(run)
						end
						for _, to_erase in ipairs(run) do
							set_component_value(to_erase)
						end
					end
					-- Label start `k` either failed to match or lies past the end, so it cannot begin a new run;
					-- resume the search just after it.
					j = k + 1
				else
					j = j + 1
				end
			end
		end
	end
)

-- Hand the module table (including its `postprocess_handlers` list) back to whoever loads this module.
return export