|
1 | 1 | {-# LANGUAGE BangPatterns #-}
|
2 | 2 | {-# LANGUAGE OverloadedStrings #-}
|
| 3 | +{-# LANGUAGE RankNTypes #-} |
3 | 4 | {-# LANGUAGE TupleSections #-}
|
4 | 5 | module WordFreq (
|
5 | 6 | loadWikiDump, WordFreq, filterWordFreq, foldWordFreq, addWord
|
6 | 7 | , commonSequences, normalize, byFrequency
|
7 | 8 | , loadWordList, knownWord
|
8 | 9 | ) where
|
9 |
| - |
| 10 | +import Debug.Trace (traceShow) |
10 | 11 | import Control.Applicative ((<|>))
|
11 | 12 | import Control.Monad (void)
|
12 | 13 | import Control.Monad.Extra (ifM)
|
13 | 14 | import Control.Parallel.Strategies
|
14 |
| -import Data.Attoparsec.Text |
| 15 | +import Data.Attoparsec.Text hiding (take) |
15 | 16 | import Data.Char (isAlpha)
|
16 | 17 | import Data.Foldable (foldl')
|
17 |
| -import Data.List (groupBy, sort, sortOn) |
| 18 | +import Data.List (groupBy, sort, sortBy, sortOn) |
18 | 19 | import Data.Map.Strict (Map)
|
19 | 20 | import Data.IntMap.Strict (IntMap)
|
20 | 21 | import qualified Data.Map.Strict as Map
|
@@ -70,27 +71,35 @@ commonSequences = WordFreq
|
70 | 71 | . deleteInfix . Map.foldrWithKey' subseqs mempty
|
71 | 72 | . unWordFreq where
|
72 | 73 | subseqs :: Text -> Int -> Map Text Int -> Map Text Int
|
73 |
| - subseqs k a !m = let l = Text.length k |
| 74 | + subseqs k a !m = let lc = Text.toLower k |
| 75 | + l = Text.length lc |
74 | 76 | in foldl' (flip $ uncurry $ Map.insertWith (+)) m [
|
75 |
| - (Text.toLower $ Text.take n $ Text.drop i k, a) |
| 77 | + (Text.take n $ Text.drop i lc, a) |
76 | 78 | | i <- [0 .. l - 1], n <- [1 .. l - i]
|
77 | 79 | ]
|
78 | 80 | deleteInfix m = Map.withoutKeys m $ mconcat $
|
79 | 81 | withStrategy (parList rdeepseq) $ map search $
|
80 |
| - foldMap (breakDownBy Text.length) $ |
| 82 | + foldMap (breakBy $ Down . Text.length) $ |
81 | 83 | Map.foldrWithKey' invert mempty m where
|
82 | 84 | invert k a = IntMap.insertWith mappend a [k]
|
83 | 85 | search (l, r) = Set.fromList $ concatMap (\x -> filter (`Text.isInfixOf` x) r) l
|
84 |
| - breakDownBy by = go . sortOn (Down . by) where |
| 86 | + breakBy by = go . sortOn fst . map (\a -> (by a, a)) where |
| 87 | + chunksOf i ls = map (take i) (splitter ls) where |
| 88 | + splitter [] = [] |
| 89 | + splitter l = l : splitter (drop i l) |
85 | 90 | go [] = []
|
86 |
| - go (x:xs) = let b = by x in case break ((b /=) . by) xs of |
| 91 | + go ((b, a):xs) = case break ((b /=) . fst) xs of |
87 | 92 | (_, []) -> []
|
88 |
| - (xs', ys) -> (x:xs', ys) : go ys |
| 93 | + (xs', ys) -> let !ys' = map snd ys |
| 94 | + pieces = chunksOf 1000 $ a:map snd xs' |
| 95 | + in map (, ys') pieces ++ go ys |
89 | 96 |
|
90 | 97 | byFrequency :: WordFreq Text -> [(Int, Text)]
|
91 |
| -byFrequency = IntMap.foldlWithKey' g [] . Map.foldrWithKey' f mempty . unWordFreq where |
| 98 | +byFrequency = IntMap.foldlWithKey' g [] |
| 99 | + . withStrategy (parTraversable rdeepseq) . fmap sort |
| 100 | + . Map.foldrWithKey' f mempty . unWordFreq where |
92 | 101 | f k a = IntMap.insertWith (const (k:)) a [k]
|
93 |
| - g xs a ks = map (a,) (sort ks) ++ xs |
| 102 | + g xs a ks = map (a,) ks ++ xs |
94 | 103 |
|
95 | 104 | loadWordList :: FilePath -> IO (Set Text)
|
96 | 105 | loadWordList = fmap (Set.fromList . map Text.toLower . Text.lines)
|
|
0 commit comments