module Licensee::ContentHelper

Constants

ALT_TITLE_REGEX
DIGEST
END_OF_TERMS_REGEX
HR_REGEX
MAX_SCALED_DELTA

Public Instance Methods

content_normalized() click to toggle source

Content without title, version, copyright, whitespace, or instructions

# File lib/licensee/content_helper.rb, line 68
# Normalized form of `content`, memoized in @content_normalized.
#
# Starts from the lowercased result of content_without_title_and_version,
# applies strip_copyright for as long as Matchers::Copyright::REGEX still
# matches, keeps only the text that precedes the first END_OF_TERMS_REGEX
# match (the whole text when there is no match), and finally passes the
# result through strip_whitespace.
#
# Returns nil when `content` is nil.
def content_normalized
  return unless content

  @content_normalized ||= begin
    text = content_without_title_and_version.downcase
    text = strip_copyright(text) while text =~ Matchers::Copyright::REGEX
    # String#partition yields [before, match, after]; only "before" is kept.
    terms = text.partition(END_OF_TERMS_REGEX).first
    strip_whitespace(terms)
  end
end
content_without_title_and_version() click to toggle source

Content with the title and version removed. The first line should normally be the attribution line. Used to DRY up `content_normalized`, but we need the case-sensitive content with attribution first to detect attribution in LicenseFile.

# File lib/licensee/content_helper.rb, line 57
# Case-preserving content with the leading markdown heading marker, HRs,
# title line(s) and version line removed. Memoized in
# @content_without_title_and_version.
#
# Returns nil when `content` is nil, mirroring the nil handling in
# content_normalized, instead of raising NoMethodError on `nil.strip`.
def content_without_title_and_version
  return unless content

  @content_without_title_and_version ||= begin
    string = content.strip
    string = strip_markdown_headings(string)
    string = strip_hrs(string)
    # A file may stack several title lines; keep stripping until the text
    # no longer starts with one.
    string = strip_title(string) while string =~ title_regex
    strip_version(string).strip
  end
end
hash() click to toggle source

SHA1 of the normalized content

# File lib/licensee/content_helper.rb, line 49
# Digest of the normalized content, computed with DIGEST.hexdigest and
# memoized in @hash.
#
# NOTE(review): content_normalized returns nil when content is nil; that nil
# is passed straight to DIGEST.hexdigest here — confirm callers never hit
# this path.
def hash
  return @hash if @hash

  @hash = DIGEST.hexdigest(content_normalized)
end
length() click to toggle source

Number of characters in the normalized content

# File lib/licensee/content_helper.rb, line 24
# Number of characters in the normalized content.
#
# Returns 0 when content_normalized is nil.
def length
  normalized = content_normalized
  normalized ? normalized.length : 0
end
length_delta(other) click to toggle source

Given another license or project file, calculates the difference in length

# File lib/licensee/content_helper.rb, line 36
# Absolute difference between this object's `length` and `other.length`.
#
# other - any object responding to `length` (another license or project file).
def length_delta(other)
  difference = length - other.length
  difference.abs
end
max_delta() click to toggle source

Number of characters that could be added/removed to still be considered a potential match

# File lib/licensee/content_helper.rb, line 31
# Number of characters that may be added or removed while still being
# considered a potential match: scaled_delta, capped at MAX_SCALED_DELTA.
def max_delta
  delta = scaled_delta
  return MAX_SCALED_DELTA unless delta < MAX_SCALED_DELTA

  delta
end
similarity(other) click to toggle source

Given another license or project file, calculates the similarity as a percentage of words in common

# File lib/licensee/content_helper.rb, line 42
# Similarity between this object and another license or project file, as a
# percentage: twice the number of shared words divided by the combined size
# of both wordsets.
#
# other - any object responding to `wordset`.
#
# Returns a Float between 0.0 and 100.0. When both wordsets are empty the
# result is 0.0; dividing by the zero total would otherwise produce NaN.
#
# `wordset` is nil when there is no normalized content; that case is not
# handled here, so both sides must have content.
def similarity(other)
  overlap = (wordset & other.wordset).size
  total = wordset.size + other.wordset.size
  return 0.0 if total.zero?

  100.0 * (overlap * 2.0 / total)
end
wordset() click to toggle source

A set of each word in the license, without duplicates

# File lib/licensee/content_helper.rb, line 17
# Set of the distinct words in the normalized content, where a word is a run
# of word characters and apostrophes. Memoized in @wordset.
#
# Returns nil when content_normalized is nil.
def wordset
  return @wordset if @wordset

  normalized = content_normalized
  @wordset = normalized ? normalized.scan(/[\w']+/).to_set : nil
end

Private Instance Methods

license_names() click to toggle source
# File lib/licensee/content_helper.rb, line 82
# One title matcher per license returned by License.all(hidden: true),
# memoized in @license_titles.
#
# For each license the ALT_TITLE_REGEX entry for its key is used when there
# is one; otherwise the lowercased name_without_version with its first "*"
# replaced by "u".
def license_names
  @license_titles ||= License.all(hidden: true).map do |license|
    ALT_TITLE_REGEX[license.key] ||
      license.name_without_version.downcase.sub('*', 'u')
  end
end
scaled_delta() click to toggle source
# File lib/licensee/content_helper.rb, line 119
# `length` multiplied by Licensee.inverse_confidence_threshold, truncated to
# an integer with to_i. Memoized in @scaled_delta.
def scaled_delta
  return @scaled_delta if @scaled_delta

  @scaled_delta = (length * Licensee.inverse_confidence_threshold).to_i
end
strip_hrs(string) click to toggle source

Strip HRs from MPL

# File lib/licensee/content_helper.rb, line 106
# Returns a copy of the string with every HR_REGEX match removed.
def strip_hrs(string)
  string.gsub(HR_REGEX, '')
end
strip_markdown_headings(string) click to toggle source

Strip leading #s from the document

# File lib/licensee/content_helper.rb, line 111
# Removes a run of "#" characters (and any whitespace before it) from the
# very start of the string, then trims surrounding whitespace. Only the
# beginning of the document is affected; later headings are left alone.
def strip_markdown_headings(string)
  without_marker = string.sub(/\A\s*#+/, '')
  without_marker.strip
end
strip_title(string) click to toggle source
# File lib/licensee/content_helper.rb, line 93
# Removes the first title_regex match from the string and trims the
# surrounding whitespace.
def strip_title(string)
  without_title = string.sub(title_regex, '')
  without_title.strip
end
strip_version(string) click to toggle source
# File lib/licensee/content_helper.rb, line 97
# Removes a first line that begins with "version" (case-insensitive) and
# trims the surrounding whitespace. A "version" line that is not at the very
# start of the string is left untouched.
def strip_version(string)
  without_version = string.sub(/\Aversion.*$/i, '')
  without_version.strip
end
strip_whitespace(string) click to toggle source
# File lib/licensee/content_helper.rb, line 115
# Flattens the string onto a single line: newlines become spaces, runs of
# spaces collapse to one, and leading/trailing whitespace is trimmed.
def strip_whitespace(string)
  single_line = string.tr("\n", ' ')
  collapsed = single_line.squeeze(' ')
  collapsed.strip
end
title_regex() click to toggle source
# File lib/licensee/content_helper.rb, line 89
# Case-insensitive regex matching a license title at the very start of a
# string: an optional "(", an optional "the ", any of license_names, and the
# remainder of that first line.
#
# Memoized in @title_regex. license_names is itself memoized, so the pattern
# never changes, and callers evaluate this method repeatedly (for example in
# a `while string =~ title_regex` loop); rebuilding the union each time is
# wasted work.
def title_regex
  @title_regex ||= /\A\(?(the )?(#{Regexp.union(license_names).source}).*$/i
end