Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.Comparator;
- import java.util.List;
- import java.io.*;
- import java.util.*;
- import java.util.stream.Collectors;
- class TextProcessor {
- List<Text> texts;
- Set<String> allWords;
- Map<String,Integer> wordsByFrequency;
- CosineSimilarityCalculator cosineSimilarityCalculator;
- public TextProcessor() {
- this.texts = new ArrayList<>();
- allWords = new TreeSet<>();
- this.wordsByFrequency= new HashMap<>();
- }
- public void readText(InputStream in) {
- BufferedReader br= new BufferedReader(new InputStreamReader(in));
- texts= br.lines().map(Text::create).collect(Collectors.toList());
- texts.stream().flatMap(i->i.words.stream()).forEach(allWords::add);
- texts.stream().flatMap(i->i.words.stream()).forEach(i->wordsByFrequency.putIfAbsent(i,0));
- texts.stream().flatMap(i->i.words.stream()).forEach(i->wordsByFrequency.computeIfPresent(i,
- (k,v)->v+1));
- }
- public void printTextsVectors(PrintStream out) {
- PrintWriter printWriter= new PrintWriter(out);
- List<List<Integer>> vectors= new ArrayList<>();
- for(Text text: texts) {
- List<Integer> vector= new ArrayList<>();
- for(String word : allWords){
- vector.add(getFrequencyNumber(text,word));
- }
- vectors.add(vector);
- }
- vectors.stream().forEach(i->printWriter.println(i));
- printWriter.flush();
- }
- private int getFrequencyNumber(Text text, String word) {
- int counter=0;
- for(String s : text.words){
- if(word.equals(s))
- counter++;
- }
- return counter;
- }
- public void printCorpus(PrintStream out, int n, boolean b) {
- PrintWriter printWriter= new PrintWriter(out);
- if(!b)
- wordsByFrequency.entrySet().stream().sorted(Comparator.comparing((Map.Entry<String, Integer> entry) -> entry.getValue(), Comparator.reverseOrder())
- .thenComparing(entry-> entry.getKey(),Comparator.naturalOrder())).limit(n)
- .forEach(i->printWriter.println(String.format("%s : %d",i.getKey(),i.getValue())));
- else
- wordsByFrequency.entrySet().stream().sorted(Comparator.comparing((Map.Entry<String, Integer> entry) -> entry.getValue(), Comparator.naturalOrder())
- .thenComparing(entry-> entry.getKey(),Comparator.naturalOrder())).limit(n)
- .forEach(i->printWriter.println(String.format("%s : %d",i.getKey(),i.getValue())));
- printWriter.flush();
- }
- public void mostSimilarTexts(PrintStream out) {
- PrintWriter printWriter= new PrintWriter(out);
- List<List<Integer>> vectors= new ArrayList<>();
- for(Text text: texts) {
- List<Integer> vector= new ArrayList<>();
- for(String word : allWords){
- vector.add(getFrequencyNumber(text,word));
- }
- vectors.add(vector);
- }
- double mostSimular=0;
- int textIndex1=0;
- int textIndex2=0;
- for(int i=0;i<vectors.size();i++) {
- for(int j=i+1;j<vectors.size();j++) {
- if(cosineSimilarityCalculator.cosineSimilarity(vectors.get(i),vectors.get(j))>mostSimular){
- mostSimular= cosineSimilarityCalculator.cosineSimilarity(vectors.get(i),vectors.get(j));
- textIndex1=i;
- textIndex2=j;
- }
- }
- }
- printWriter.println(texts.get(textIndex1));
- printWriter.println(texts.get(textIndex2));
- printWriter.println(String.format("%.10f",mostSimular));
- printWriter.flush();
- }
- }
/** A single text reduced to its sequence of lower-cased, letters-only words. */
class Text {
    /** Words of the text in their original order. */
    List<String> words;

    public Text(List<String> words) {
        this.words = words;
    }

    /**
     * Parses one line into a {@code Text}. The line is split on whitespace; every non-letter
     * character is removed from each token and the remainder is lower-cased. Tokens left with no
     * letters (numbers, bare punctuation, the empty token produced by leading whitespace) are
     * skipped, so an empty string is never recorded as a word.
     *
     * @param line raw input line
     * @return the text holding the cleaned words, possibly with no words at all
     */
    public static Text create(String line) {
        List<String> finalWords = new ArrayList<>();
        for (String token : line.split("\\s+")) {
            StringBuilder letters = new StringBuilder(token.length());
            for (int i = 0; i < token.length(); i++) {
                char c = token.charAt(i);
                if (Character.isLetter(c)) {
                    letters.append(c);
                }
            }
            if (letters.length() > 0) {
                // Locale.ROOT keeps the casing rules independent of the machine's default locale.
                finalWords.add(letters.toString().toLowerCase(Locale.ROOT));
            }
        }
        return new Text(finalWords);
    }

    /** Returns the words joined by single spaces. */
    @Override
    public String toString() {
        return String.join(" ", words);
    }
}
/** Computes the cosine similarity between two integer vectors. */
class CosineSimilarityCalculator {
    /**
     * Returns the cosine of the angle between two vectors given as collections of integers,
     * read in iteration order: {@code dot(c1, c2) / (|c1| * |c2|)}.
     *
     * @param c1 first vector
     * @param c2 second vector; must have as many components as {@code c1}
     * @return the cosine similarity, or {@code 0.0} when either vector has zero magnitude
     *         (the ratio is undefined there and would otherwise come out as NaN)
     * @throws IllegalArgumentException if the vectors differ in size
     */
    public static double cosineSimilarity(Collection<Integer> c1, Collection<Integer> c2) {
        int[] array1 = c1.stream().mapToInt(Integer::intValue).toArray();
        int[] array2 = c2.stream().mapToInt(Integer::intValue).toArray();
        if (array1.length != array2.length) {
            throw new IllegalArgumentException(
                    "Vectors must have the same size: " + array1.length + " vs " + array2.length);
        }

        double up = 0.0;
        double down1 = 0.0;
        double down2 = 0.0;
        for (int i = 0; i < array1.length; i++) {
            // Widen before multiplying so large components cannot overflow int arithmetic.
            double a = array1[i];
            double b = array2[i];
            up += a * b;
            down1 += a * a;
            down2 += b * b;
        }

        if (down1 == 0.0 || down2 == 0.0) {
            return 0.0;
        }
        return up / (Math.sqrt(down1) * Math.sqrt(down2));
    }
}
- public class TextProcessorTest {
- public static void main(String[] args) {
- TextProcessor textProcessor = new TextProcessor();
- textProcessor.readText(System.in);
- System.out.println("===PRINT VECTORS===");
- textProcessor.printTextsVectors(System.out);
- System.out.println("PRINT FIRST 20 WORDS SORTED ASCENDING BY FREQUENCY ");
- textProcessor.printCorpus(System.out, 20, true);
- System.out.println("PRINT FIRST 20 WORDS SORTED DESCENDING BY FREQUENCY");
- textProcessor.printCorpus(System.out, 20, false);
- System.out.println("===MOST SIMILAR TEXTS===");
- textProcessor.mostSimilarTexts(System.out);
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement