|
|
|
module internal DataProcessing
|
|
|
|
|
|
|
|
open System.Text.RegularExpressions
|
|
|
|
open DataCleaning
|
|
|
|
open System
|
|
|
|
open System.IO
|
|
|
|
|
|
|
|
/// Tests whether `text` contains at least one match for the regular
/// expression `pattern`.
/// The pattern comes first so the function can be partially applied
/// and used at the end of a pipeline.
let MatchText pattern text =
    Regex.IsMatch(text, pattern)
|
|
|
|
|
|
|
|
/// Joins a sequence of strings into one string, placing a single
/// space between consecutive items.
let ConcatToString words =
    words |> String.concat " "
|
|
|
|
|
|
|
|
/// Validates that `gibberishLevel` lies in the inclusive range 2..20.
///
/// Raises `ArgumentException` (via `invalidArg`) when the value is out
/// of range. On success the result is the `ignore` function itself
/// (not unit), so callers typically discard the result.
let GibberishLevelIsValid gibberishLevel =
    let withinRange = gibberishLevel >= 2 && gibberishLevel <= 20

    if withinRange then
        ignore
    else
        invalidArg "gibberishLevel" "Invalid argument. Must be between 2 and 20 (inclusive)."
|
|
|
|
|
|
|
|
/// Validates that `sentences` is at least 1.
///
/// Raises `ArgumentException` (via `invalidArg`) for zero or negative
/// values. On success the result is the `ignore` function itself
/// (not unit), so callers typically discard the result.
let SentencesIsValid sentences =
    if sentences >= 1 then
        ignore
    else
        invalidArg "sentences" "Invalid argument. Must be greater than 0."
|
|
|
|
|
|
|
|
/// Validates that `filePath` points at an existing `.txt` file.
///
/// The extension is compared case-insensitively, so `notes.TXT` and
/// `notes.Txt` are accepted as text files just like `notes.txt`.
///
/// Raises `ArgumentException` (via `invalidArg`) when the extension is
/// not `.txt`, and `FileNotFoundException` when the extension is fine
/// but no file exists at the path. On success the result is the
/// `ignore` function itself (not unit), so callers typically discard
/// the result.
let FilePathIsValid filePath =
    // The annotation pins the string overload of Path.GetExtension.
    let extension: string = Path.GetExtension filePath

    // The extension is checked before existence so that a wrong file
    // type is reported even when the file is also missing.
    if not (String.Equals(extension, ".txt", StringComparison.OrdinalIgnoreCase)) then
        invalidArg "filePath" "Invalid argument. File must be a .txt file."
    elif not (File.Exists filePath) then
        raise (FileNotFoundException("Unable to find the file at the location specified."))
    else
        ignore
|
|
|
|
|
|
|
|
/// Reports whether `text` contains at least one sentence-ending token:
/// a full stop, an exclamation mark or a question mark, anywhere in
/// the string.
let TextContainsValidEndToken (text: string) =
    [ "."; "!"; "?" ]
    |> List.exists (fun (token: string) -> text.Contains token)
|
|
|
|
|
|
|
|
/// Breaks `text` into pieces with `SplitText` using the pattern `\s+`
/// (runs of whitespace) and yields every sliding window of `groupSize`
/// consecutive pieces, in order.
/// NOTE(review): `SplitText` comes from `DataCleaning`; it is presumed
/// to be a regex split - confirm against that module.
let SortIntoGroups groupSize text =
    text
    |> SplitText @"\s+"
    |> Seq.windowed groupSize
|
|
|
|
|
|
|
|
/// Splits an array of words into a pair: every word except the last,
/// joined with single spaces, and the last word on its own.
///
/// An empty array is not supported and raises an exception.
let BisectWords words =
    let lastIndex = Array.length words - 1
    let leading = Seq.take lastIndex words |> ConcatToString
    leading, words.[lastIndex]
|
|
|
|
|
|
|
|
/// Joins `prev` and `next` with a single space, skipping either one
/// when it is null, empty or whitespace-only. If both are skipped the
/// result is the empty string.
let CombineWords prev next =
    let hasContent = String.IsNullOrWhiteSpace >> not

    [ prev; next ]
    |> List.filter hasContent
    |> ConcatToString
|
|
|
|
|
|
|
|
/// Applies the standard clean-up pipeline to `text`.
///
/// `ReplaceArtifact` is applied for the double-quote character, then
/// for the sequence "\n\nIn", then for "\r", then for "\n" - in that
/// order. The result is split on runs of whitespace with `SplitText`
/// and re-joined with single spaces.
/// NOTE(review): `ReplaceArtifact` and `SplitText` come from
/// `DataCleaning`; what an artifact is replaced with is not visible
/// here - confirm against that module.
let ApplyStandardSetup text =
    // Order matters: "\n\nIn" has to be handled before the bare "\n".
    let stripArtifacts =
        ReplaceArtifact "\""
        >> ReplaceArtifact "\n\nIn"
        >> ReplaceArtifact "\r"
        >> ReplaceArtifact "\n"

    text
    |> stripArtifacts
    |> SplitText @"\s+"
    |> ConcatToString
|